Spaces:

mimi111222
/

_phisingdector

Sleeping

App Files Files Community

mimi111222 committed on Oct 29, 2025

Commit

844ac8e

verified ·

1 Parent(s): 68431df

Update app.py

Browse files

Files changed (1) hide show

app.py +373 -191

app.py CHANGED Viewed

@@ -12,6 +12,8 @@ from sklearn.model_selection import train_test_split
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
 import matplotlib.pyplot as plt
 import seaborn as sns
 import io
@@ -19,12 +21,20 @@ import os
 # Page Configuration
 st.set_page_config(
-    page_title="AI Phishing Shield – by Umaima Qureshi",
     layout="wide",
     initial_sidebar_state="collapsed"
 )
-# Premium Black & Gold CSS Styling - ENHANCED VERSION
 st.markdown("""
 <style>
 @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700;800;900&display=swap');
@@ -51,16 +61,13 @@ section[data-testid="stSidebar"] {
     display: none;
 }
-/* Hero Section - Enhanced */
 .hero-container {
     background: linear-gradient(135deg, #1a1a1a 0%, #0f0f0f 100%);
     border-radius: 32px;
     padding: 4rem 3rem;
     margin-bottom: 3rem;
-    box-shadow:
-        0 25px 70px rgba(0,0,0,0.6),
-        0 10px 30px rgba(218,165,32,0.25),
-        inset 0 1px 0 rgba(255,255,255,0.1);
     position: relative;
     overflow: hidden;
     border: 2px solid rgba(218,165,32,0.4);
@@ -105,7 +112,6 @@ section[data-testid="stSidebar"] {
     z-index: 1;
     letter-spacing: -0.03em;
     filter: drop-shadow(0 4px 20px rgba(255,215,0,0.4));
-    text-shadow: 0 0 80px rgba(255,215,0,0.3);
 }
 .hero-subtitle {
@@ -137,9 +143,7 @@ section[data-testid="stSidebar"] {
     font-size: 1.05rem;
     font-weight: 700;
     margin-top: 1.8rem;
-    box-shadow:
-        0 8px 25px rgba(255,215,0,0.5),
-        0 0 40px rgba(255,215,0,0.3);
     position: relative;
     z-index: 1;
     transition: all 0.3s ease;
@@ -147,9 +151,7 @@ section[data-testid="stSidebar"] {
 .hero-badge:hover {
     transform: translateY(-2px);
-    box-shadow:
-        0 12px 35px rgba(255,215,0,0.6),
-        0 0 50px rgba(255,215,0,0.4);
 }
 /* Section Headers */
@@ -178,7 +180,7 @@ section[data-testid="stSidebar"] {
     border-radius: 2px;
 }
-/* Stats Grid - Enhanced */
 .stats-grid {
     display: grid;
     grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
@@ -192,10 +194,7 @@ section[data-testid="stSidebar"] {
     border-radius: 24px;
     text-align: center;
     color: #0f0f0f;
-    box-shadow:
-        0 10px 30px rgba(255,215,0,0.35),
-        0 0 40px rgba(255,215,0,0.2),
-        inset 0 1px 0 rgba(255,255,255,0.3);
     transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1);
     position: relative;
     overflow: hidden;
@@ -220,10 +219,7 @@ section[data-testid="stSidebar"] {
 .stat-card:hover {
     transform: translateY(-10px) scale(1.03);
-    box-shadow:
-        0 20px 50px rgba(255,215,0,0.5),
-        0 0 60px rgba(255,215,0,0.3),
-        inset 0 1px 0 rgba(255,255,255,0.4);
 }
 .stat-value {
@@ -247,7 +243,7 @@ section[data-testid="stSidebar"] {
     color: #0f0f0f;
 }
-/* Input Areas - Enhanced */
 .stTextArea textarea {
     border-radius: 18px;
     border: 2px solid rgba(218,165,32,0.35);
@@ -265,7 +261,7 @@ section[data-testid="stSidebar"] {
     background: rgba(26,26,26,0.95) !important;
 }
-/* Buttons - Enhanced */
 .stButton > button {
     background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
     color: #0f0f0f;
@@ -275,9 +271,7 @@ section[data-testid="stSidebar"] {
     font-size: 1.15rem;
     font-weight: 700;
     transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
-    box-shadow:
-        0 4px 15px rgba(255,215,0,0.4),
-        0 0 30px rgba(255,215,0,0.2);
     width: 100%;
     letter-spacing: 0.5px;
     position: relative;
@@ -304,42 +298,22 @@ section[data-testid="stSidebar"] {
 .stButton > button:hover {
     transform: translateY(-3px);
-    box-shadow:
-        0 8px 25px rgba(255,215,0,0.6),
-        0 0 50px rgba(255,215,0,0.3);
 }
 .stButton > button:active {
     transform: translateY(-1px);
 }
-/* Alert Boxes - Enhanced */
-.alert-danger {
-    background: linear-gradient(135deg, #ef4444 0%, #dc2626 100%);
-    color: white;
     padding: 2rem;
     border-radius: 20px;
     font-size: 1.1rem;
     font-weight: 600;
-    box-shadow:
-        0 10px 30px rgba(239,68,68,0.4),
-        0 0 50px rgba(239,68,68,0.2);
     margin: 1.5rem 0;
     border: 2px solid rgba(255,255,255,0.1);
-}
-.alert-success {
-    background: linear-gradient(135deg, #10b981 0%, #059669 100%);
     color: white;
-    padding: 2rem;
-    border-radius: 20px;
-    font-size: 1.1rem;
-    font-weight: 600;
-    box-shadow:
-        0 10px 30px rgba(16,185,129,0.4),
-        0 0 50px rgba(16,185,129,0.2);
-    margin: 1.5rem 0;
-    border: 2px solid rgba(255,255,255,0.1);
 }
 .confidence-bar {
@@ -359,15 +333,13 @@ section[data-testid="stSidebar"] {
     box-shadow: 0 0 10px rgba(255,255,255,0.5);
 }
-/* Hints Panel - Enhanced */
 .hints-panel {
     background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
     border-radius: 20px;
     padding: 2rem;
     border-left: 5px solid #FFD700;
-    box-shadow:
-        0 4px 15px rgba(0,0,0,0.4),
-        inset 0 1px 0 rgba(255,255,255,0.05);
     backdrop-filter: blur(10px);
 }
@@ -395,26 +367,22 @@ section[data-testid="stSidebar"] {
     box-shadow: 0 2px 8px rgba(255,215,0,0.4);
 }
-/* Metric Cards - Enhanced */
 .metric-container {
     background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
     padding: 1.8rem;
     border-radius: 16px;
     border-left: 5px solid #FFD700;
-    box-shadow:
-        0 4px 12px rgba(0,0,0,0.4),
-        inset 0 1px 0 rgba(255,255,255,0.05);
     transition: all 0.3s ease;
 }
 .metric-container:hover {
     transform: translateY(-2px);
-    box-shadow:
-        0 6px 18px rgba(0,0,0,0.5),
-        inset 0 1px 0 rgba(255,255,255,0.08);
 }
-/* File Uploader - Enhanced */
 .stFileUploader {
     border: 2px dashed rgba(218,165,32,0.45);
     border-radius: 18px;
@@ -429,7 +397,7 @@ section[data-testid="stSidebar"] {
     box-shadow: 0 0 20px rgba(255,215,0,0.15);
 }
-/* Expanders - Enhanced */
 .streamlit-expanderHeader {
     background: linear-gradient(135deg, rgba(218,165,32,0.2) 0%, rgba(218,165,32,0.1) 100%) !important;
     border-radius: 14px !important;
@@ -489,7 +457,7 @@ section[data-testid="stSidebar"] {
     color: #e5e7eb !important;
 }
-/* Footer - Enhanced */
 .footer {
     background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
     border-radius: 20px;
@@ -497,9 +465,7 @@ section[data-testid="stSidebar"] {
     text-align: center;
     margin-top: 4rem;
     color: #9ca3af;
-    box-shadow:
-        0 8px 24px rgba(0,0,0,0.4),
-        inset 0 1px 0 rgba(255,255,255,0.05);
     border: 2px solid rgba(218,165,32,0.3);
 }
@@ -560,23 +526,136 @@ def safe_read_csv(path):
     except Exception as e:
         return pd.DataFrame()
 def preprocess_text(text):
     if not isinstance(text, str):
         text = str(text)
     text = text.lower()
-    text = re.sub(r'http\S+|www\S+|https\S+', ' url ', text)
-    text = re.sub(r'\S+@\S+', ' email ', text)
     text = re.sub(r'[^a-z\s]', ' ', text)
     text = re.sub(r'\s+', ' ', text).strip()
     return text
 # Hero Header
 st.markdown("""
 <div class="hero-container">
     <div class="hero-title">🛡️ AI Phishing Shield</div>
     <div class="hero-subtitle">Advanced Machine Learning Protection Against Email Threats</div>
     <div class="hero-description">
-        Powered by TF-IDF vectorization and Logistic Regression, trained on thousands of real-world phishing examples.
         Get instant threat analysis with confidence scoring and explainable AI insights.
     </div>
     <div class="hero-badge">⚡ Developed by Umaima Qureshi</div>
@@ -617,6 +696,12 @@ else:
         ]
     })
 # Clean & Prepare Dataset
 if "Unnamed: 0" in df.columns:
     df = df.drop(columns=["Unnamed: 0"])
@@ -671,6 +756,7 @@ with st.expander("🔍 View Dataset Preview", expanded=False):
 # Model Training
 @st.cache_resource
 def train_model(processed_texts, labels, test_size=0.2, random_state=42):
     unique_labels, counts = np.unique(labels, return_counts=True)
     min_samples = counts.min()
@@ -693,18 +779,31 @@ def train_model(processed_texts, labels, test_size=0.2, random_state=42):
                 processed_texts, labels, test_size=test_size, random_state=random_state, stratify=None
             )
-    vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
     X_train_vec = vectorizer.fit_transform(X_train)
     X_test_vec = vectorizer.transform(X_test)
-    model = LogisticRegression(max_iter=1000, solver='liblinear')
     model.fit(X_train_vec, y_train)
     y_pred = model.predict(X_test_vec)
     acc = accuracy_score(y_test, y_pred)
     cm = confusion_matrix(y_test, y_pred)
     report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
     return {
         "vectorizer": vectorizer,
         "model": model,
@@ -713,8 +812,17 @@ def train_model(processed_texts, labels, test_size=0.2, random_state=42):
         "report": report
     }
-model_info = train_model(df['processed_text'].tolist(), df['label'].values)
-vectorizer, model, accuracy = model_info["vectorizer"], model_info["model"], model_info["accuracy"]
 # Model Performance
 st.markdown('<div class="section-title">🎯 Model Performance</div>', unsafe_allow_html=True)
@@ -747,53 +855,23 @@ with col3:
     </div>
     """, unsafe_allow_html=True)
-# Fixed Confusion Matrix Section - No re-rendering
 with st.expander("📈 Detailed Metrics & Confusion Matrix"):
     col_matrix, col_report = st.columns([1, 1.5])
     with col_matrix:
-        # Use container to prevent re-rendering
-        with st.container():
-            # Set dark theme for matplotlib
-            plt.style.use('dark_background')
-            fig, ax = plt.subplots(figsize=(5, 4), facecolor='#1a1a1a')
-            ax.set_facecolor('#1a1a1a')
-            sns.heatmap(
-                model_info["confusion_matrix"],
-                annot=True,
-                fmt="d",
-                ax=ax,
-                cmap="YlOrBr",
-                cbar=True,
-                square=True,
-                annot_kws={"size": 16, "weight": "bold", "color": "#0f0f0f"},
-                linewidths=2,
-                linecolor='#0f0f0f',
-                cbar_kws={'label': 'Count', 'shrink': 0.8}
-            )
-            ax.set_xlabel("Predicted", fontsize=11, fontweight='bold', color='#FFD700')
-            ax.set_ylabel("Actual", fontsize=11, fontweight='bold', color='#FFD700')
-            ax.set_xticklabels(["Safe", "Phishing"], fontsize=10, color='#e5e7eb')
-            ax.set_yticklabels(["Safe", "Phishing"], fontsize=10, rotation=0, color='#e5e7eb')
-            ax.set_title("Confusion Matrix", fontsize=13, fontweight='bold', pad=12, color='#FFD700')
-            # Style the colorbar
-            cbar = ax.collections[0].colorbar
-            cbar.ax.yaxis.set_tick_params(color='#e5e7eb')
-            plt.setp(plt.getp(cbar.ax.axes, 'yticklabels'), color='#e5e7eb')
-            plt.tight_layout()
-            st.pyplot(fig, use_container_width=True)
-            plt.close(fig)  # Close figure to prevent memory leak
     with col_report:
         st.markdown("**📊 Classification Report:**")
         report_df = pd.DataFrame(model_info["report"]).transpose().round(3)
         st.dataframe(
-            report_df,
-            use_container_width=True,
             height=250
         )
@@ -817,82 +895,169 @@ with col_input:
             email_input = uploaded_txt.read().decode("utf-8", errors="ignore")
         except Exception:
             email_input = str(uploaded_txt.getvalue())
     if st.button("🔍 Analyze Email Threat"):
         if not email_input.strip():
             st.warning("⚠️ Please paste or upload email content to analyze")
         else:
-            with st.spinner("🔍 Analyzing email threat..."):
-                processed_input = preprocess_text(email_input)
-                input_vec = vectorizer.transform([processed_input])
-                try:
-                    proba = model.predict_proba(input_vec)[0][1]
-                except Exception:
                     try:
-                        score = model.decision_function(input_vec)[0]
-                        proba = 1/(1+np.exp(-score))
-                    except Exception:
-                        proba = None
-                pred = model.predict(input_vec)[0]
-                if pred == 1:
-                    conf_pct = f"{proba:.1%}" if proba is not None else "N/A"
-                    st.markdown(f"""
-                    <div class="alert-danger">
-                        <div style="display: flex; align-items: center; gap: 1rem; margin-bottom: 0.75rem;">
-                            <div style="font-size: 2.5rem;">⚠️</div>
-                            <div>
-                                <div style="font-size: 1.5rem; font-weight: 800; letter-spacing: 0.5px;">PHISHING DETECTED</div>
-                                <div style="font-size: 1.05rem; opacity: 0.95; margin-top: 0.25rem;">Threat Confidence: {conf_pct}</div>
                             </div>
-                        </div>
-                        <div class="confidence-bar">
-                            <div class="confidence-fill" style="width: {proba*100 if proba else 0}%;"></div>
-                        </div>
-                    </div>
-                    """, unsafe_allow_html=True)
-                    st.markdown("**🔍 Threat Indicators Detected:**")
-                    indicators = []
-                    if "url" in processed_input:
-                        indicators.append("🔗 Suspicious URL tokens detected")
-                    if re.search(r'\b(urgent|immediately|verify|password|suspended|click|act now|action required)\b', processed_input):
-                        indicators.append("⚡ Urgency manipulation tactics")
-                    if re.search(r'\b(bank|account|verify|login|password|security|credential|paypal)\b', processed_input):
-                        indicators.append("🏦 Financial/security keywords present")
-                    if re.search(r'\b(winner|prize|congratulations|claim|free|won)\b', processed_input):
-                        indicators.append("🎁 Reward/prize baiting language")
-                    if re.search(r'\b(confirm|update|validate|unlock|restore)\b', processed_input):
-                        indicators.append("🔐 Account action requests")
-                    for indicator in indicators:
-                        st.markdown(f"- {indicator}")
-                    if not indicators:
-                        st.markdown("- ⚠️ Content pattern matches known phishing templates")
-                    st.error("🚨 **Recommendation:** Do NOT click any links. Delete this email immediately and report to your IT security team.")
-                else:
-                    conf_pct = f"{(1-proba):.1%}" if proba is not None else "N/A"
-                    st.markdown(f"""
-                    <div class="alert-success">
-                        <div style="display: flex; align-items: center; gap: 1rem; margin-bottom: 0.75rem;">
-                            <div style="font-size: 2.5rem;">✅</div>
-                            <div>
-                                <div style="font-size: 1.5rem; font-weight: 800; letter-spacing: 0.5px;">EMAIL APPEARS SAFE</div>
-                                <div style="font-size: 1.05rem; opacity: 0.95; margin-top: 0.25rem;">Safety Confidence: {conf_pct}</div>
                             </div>
-                        </div>
-                        <div class="confidence-bar">
-                            <div class="confidence-fill" style="width: {(1-proba)*100 if proba else 100}%;"></div>
-                        </div>
-                    </div>
-                    """, unsafe_allow_html=True)
-                    st.markdown("**✓ No obvious threat indicators found in content analysis**")
-                    st.info("💡 **Best Practice:** Always verify sender identity through known contact methods and be cautious with unexpected emails, even if they appear safe.")
 with col_hints:
     st.markdown("""
@@ -924,6 +1089,11 @@ with col_hints:
             <div><strong>Prize/reward</strong> language is a common phishing tactic</div>
         </div>
         <div class="hint-item">
             <div class="hint-icon">⚠️</div>
             <div><strong>Limitations:</strong> This tool analyzes text content only. Always verify sender identity separately.</div>
@@ -931,6 +1101,18 @@ with col_hints:
     </div>
     """, unsafe_allow_html=True)
 # Additional Tips Section
 st.markdown('<div class="section-title">💡 Phishing Protection Tips</div>', unsafe_allow_html=True)
@@ -938,8 +1120,8 @@ col_tip1, col_tip2, col_tip3 = st.columns(3)
 with col_tip1:
     st.markdown("""
-    <div style="background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
-                padding: 1.5rem; border-radius: 16px; border-left: 4px solid #FFD700;
                 box-shadow: 0 4px 15px rgba(0,0,0,0.3); height: 100%;">
         <div style="font-size: 2rem; margin-bottom: 0.75rem;">🔍</div>
         <div style="font-weight: 700; font-size: 1.1rem; color: #FFD700; margin-bottom: 0.75rem;">Verify Sender</div>
@@ -951,8 +1133,8 @@ with col_tip1:
 with col_tip2:
     st.markdown("""
-    <div style="background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
-                padding: 1.5rem; border-radius: 16px; border-left: 4px solid #FFD700;
                 box-shadow: 0 4px 15px rgba(0,0,0,0.3); height: 100%;">
         <div style="font-size: 2rem; margin-bottom: 0.75rem;">🔗</div>
         <div style="font-weight: 700; font-size: 1.1rem; color: #FFD700; margin-bottom: 0.75rem;">Hover Links</div>
@@ -964,8 +1146,8 @@ with col_tip2:
 with col_tip3:
     st.markdown("""
-    <div style="background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
-                padding: 1.5rem; border-radius: 16px; border-left: 4px solid #FFD700;
                 box-shadow: 0 4px 15px rgba(0,0,0,0.3); height: 100%;">
         <div style="font-size: 2rem; margin-bottom: 0.75rem;">📞</div>
         <div style="font-weight: 700; font-size: 1.1rem; color: #FFD700; margin-bottom: 0.75rem;">Contact Directly</div>
@@ -986,7 +1168,7 @@ st.markdown("""
         For production use: Implement additional verification layers, link scanning, attachment analysis, and human oversight
     </div>
     <div style="margin-top: 1.5rem; padding-top: 1.5rem; border-top: 1px solid rgba(218,165,32,0.2); font-size: 0.9rem; color: #6b7280;">
-        Powered by TF-IDF • Logistic Regression • Scikit-learn • Streamlit
     </div>
     <div style="margin-top: 1rem; font-size: 0.85rem; color: #6b7280;">
         © 2024 AI Phishing Shield | All Rights Reserved

 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
+import matplotlib
+matplotlib.use('Agg')  # Use non-interactive backend
 import matplotlib.pyplot as plt
 import seaborn as sns
 import io
 # Page Configuration
 st.set_page_config(
+    page_title="AI Phishing Shield – by Umaima Qureshi",
     layout="wide",
     initial_sidebar_state="collapsed"
 )
# Initialize Session State
# Seed each per-session key exactly once; values written by earlier reruns
# are left untouched.
for _state_key, _state_default in (
    ('model_trained', False),
    ('analysis_history', []),
    ('cm_plot_cached', None),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
+# Premium Black & Gold CSS Styling
 st.markdown("""
 <style>
 @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700;800;900&display=swap');
     display: none;
 }
+/* Hero Section */
 .hero-container {
     background: linear-gradient(135deg, #1a1a1a 0%, #0f0f0f 100%);
     border-radius: 32px;
     padding: 4rem 3rem;
     margin-bottom: 3rem;
+    box-shadow: 0 25px 70px rgba(0,0,0,0.6), 0 10px 30px rgba(218,165,32,0.25), inset 0 1px 0 rgba(255,255,255,0.1);
     position: relative;
     overflow: hidden;
     border: 2px solid rgba(218,165,32,0.4);
     z-index: 1;
     letter-spacing: -0.03em;
     filter: drop-shadow(0 4px 20px rgba(255,215,0,0.4));
 }
 .hero-subtitle {
     font-size: 1.05rem;
     font-weight: 700;
     margin-top: 1.8rem;
+    box-shadow: 0 8px 25px rgba(255,215,0,0.5), 0 0 40px rgba(255,215,0,0.3);
     position: relative;
     z-index: 1;
     transition: all 0.3s ease;
 .hero-badge:hover {
     transform: translateY(-2px);
+    box-shadow: 0 12px 35px rgba(255,215,0,0.6), 0 0 50px rgba(255,215,0,0.4);
 }
 /* Section Headers */
     border-radius: 2px;
 }
+/* Stats Grid */
 .stats-grid {
     display: grid;
     grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
     border-radius: 24px;
     text-align: center;
     color: #0f0f0f;
+    box-shadow: 0 10px 30px rgba(255,215,0,0.35), 0 0 40px rgba(255,215,0,0.2), inset 0 1px 0 rgba(255,255,255,0.3);
     transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1);
     position: relative;
     overflow: hidden;
 .stat-card:hover {
     transform: translateY(-10px) scale(1.03);
+    box-shadow: 0 20px 50px rgba(255,215,0,0.5), 0 0 60px rgba(255,215,0,0.3), inset 0 1px 0 rgba(255,255,255,0.4);
 }
 .stat-value {
     color: #0f0f0f;
 }
+/* Input Areas */
 .stTextArea textarea {
     border-radius: 18px;
     border: 2px solid rgba(218,165,32,0.35);
     background: rgba(26,26,26,0.95) !important;
 }
+/* Buttons */
 .stButton > button {
     background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
     color: #0f0f0f;
     font-size: 1.15rem;
     font-weight: 700;
     transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
+    box-shadow: 0 4px 15px rgba(255,215,0,0.4), 0 0 30px rgba(255,215,0,0.2);
     width: 100%;
     letter-spacing: 0.5px;
     position: relative;
 .stButton > button:hover {
     transform: translateY(-3px);
+    box-shadow: 0 8px 25px rgba(255,215,0,0.6), 0 0 50px rgba(255,215,0,0.3);
 }
 .stButton > button:active {
     transform: translateY(-1px);
 }
+/* Dynamic Alert Boxes */
+.alert-box {
     padding: 2rem;
     border-radius: 20px;
     font-size: 1.1rem;
     font-weight: 600;
     margin: 1.5rem 0;
     border: 2px solid rgba(255,255,255,0.1);
     color: white;
 }
 .confidence-bar {
     box-shadow: 0 0 10px rgba(255,255,255,0.5);
 }
+/* Hints Panel */
 .hints-panel {
     background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
     border-radius: 20px;
     padding: 2rem;
     border-left: 5px solid #FFD700;
+    box-shadow: 0 4px 15px rgba(0,0,0,0.4), inset 0 1px 0 rgba(255,255,255,0.05);
     backdrop-filter: blur(10px);
 }
     box-shadow: 0 2px 8px rgba(255,215,0,0.4);
 }
+/* Metric Cards */
 .metric-container {
     background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
     padding: 1.8rem;
     border-radius: 16px;
     border-left: 5px solid #FFD700;
+    box-shadow: 0 4px 12px rgba(0,0,0,0.4), inset 0 1px 0 rgba(255,255,255,0.05);
     transition: all 0.3s ease;
 }
 .metric-container:hover {
     transform: translateY(-2px);
+    box-shadow: 0 6px 18px rgba(0,0,0,0.5), inset 0 1px 0 rgba(255,255,255,0.08);
 }
+/* File Uploader */
 .stFileUploader {
     border: 2px dashed rgba(218,165,32,0.45);
     border-radius: 18px;
     box-shadow: 0 0 20px rgba(255,215,0,0.15);
 }
+/* Expanders */
 .streamlit-expanderHeader {
     background: linear-gradient(135deg, rgba(218,165,32,0.2) 0%, rgba(218,165,32,0.1) 100%) !important;
     border-radius: 14px !important;
     color: #e5e7eb !important;
 }
+/* Footer */
 .footer {
     background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
     border-radius: 20px;
     text-align: center;
     margin-top: 4rem;
     color: #9ca3af;
+    box-shadow: 0 8px 24px rgba(0,0,0,0.4), inset 0 1px 0 rgba(255,255,255,0.05);
     border: 2px solid rgba(218,165,32,0.3);
 }
     except Exception as e:
         return pd.DataFrame()
def sanitize_input(text):
    """Strip HTML markup from user-supplied email text.

    ``<script>`` elements are removed together with their contents; every
    other tag is removed but its inner text is kept.

    Args:
        text: Raw email content as a string.

    Returns:
        The text with script blocks and HTML tags removed.

    NOTE(review): this is a best-effort regex filter, not a full HTML
    sanitizer. Anything rendered with ``unsafe_allow_html=True`` should
    still be escaped at the point of output.
    """
    # Allow optional whitespace before the closing '>' ("</script >" is a
    # valid end tag) so the script body is not left behind in the text.
    text = re.sub(
        r'<script\b.*?</script\s*>',
        '',
        text,
        flags=re.DOTALL | re.IGNORECASE,
    )
    # [^>]* also matches newlines, so tags whose attributes are wrapped
    # across several lines are stripped as well.
    text = re.sub(r'<[^>]*>', '', text)
    return text
def validate_email_input(text, *, min_length=10, max_length=10000):
    """Check that email content has a length suitable for analysis.

    Both limits are applied to the content with surrounding whitespace
    removed, so trailing newlines from a paste or file upload neither help
    a too-short message pass nor push a full-length message over the limit.

    Args:
        text: Email content to validate. ``None`` is treated as empty.
        min_length: Minimum number of non-padding characters required.
        max_length: Maximum number of non-padding characters allowed.

    Returns:
        A ``(is_valid, message)`` tuple. ``message`` is an empty string when
        the input is valid, otherwise a user-facing explanation.
    """
    if not isinstance(text, str):
        text = "" if text is None else str(text)

    content_length = len(text.strip())
    if content_length < min_length:
        return False, (
            f"Email content too short for analysis "
            f"(minimum {min_length:,} characters)"
        )
    if content_length > max_length:
        return False, f"Email content too long (maximum {max_length:,} characters)"
    return True, ""
# Bound the cache: keys are arbitrary user-pasted emails, so an unbounded
# cache would grow with every distinct message analysed.
@st.cache_data(max_entries=512)
def preprocess_text_cached(text):
    """Return ``preprocess_text(text)``, memoised by Streamlit.

    Args:
        text: Raw email content.

    Returns:
        The normalised text produced by ``preprocess_text``.
    """
    return preprocess_text(text)
 def preprocess_text(text):
     """Normalise raw email text into lowercase alphabetic tokens.

     URLs, email addresses, dollar amounts and 16-digit card-like numbers
     are replaced by placeholder words before all remaining non-letters are
     dropped, so those signals survive as tokens.

     Args:
         text: Raw email content. Non-string values are converted with
             ``str``; ``None`` and NaN are treated as empty input.

     Returns:
         A single-space-separated lowercase string containing only a-z.
     """
     # Missing values would otherwise become the literal words "none"/"nan".
     if text is None or (isinstance(text, float) and text != text):
         return ""
     if not isinstance(text, str):
         text = str(text)
     text = text.lower()
     # 'http\S+' already covers https links, so no separate alternative.
     text = re.sub(r'http\S+|www\S+', ' suspiciousurl ', text)
     text = re.sub(r'\S+@\S+', ' emailaddress ', text)
     # Placeholders must be inserted before digits and '$' are stripped below.
     text = re.sub(r'\$\d+', ' moneymention ', text)
     text = re.sub(r'\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}', ' cardnumber ', text)
     text = re.sub(r'[^a-z\s]', ' ', text)
     text = re.sub(r'\s+', ' ', text).strip()
     return text
def _term_pattern(term):
    """Build a case-insensitive whole-word pattern for a keyword or phrase."""
    words = r'\s+'.join(re.escape(part) for part in term.split())
    # Lookarounds rather than \b so terms starting with '$' still anchor.
    # A small suffix set keeps simple inflections ("expires", "payments").
    return re.compile(
        r'(?<!\w)' + words + r'(?:s|es|d|ed|ing)?(?!\w)',
        re.IGNORECASE,
    )


def _count_terms(terms, text):
    """Return how many of *terms* occur in *text* as whole words/phrases."""
    return sum(1 for term in terms if _term_pattern(term).search(text))


def calculate_phishing_score(text):
    """Score email text against a set of rule-based phishing indicators.

    Keywords are matched as whole words or phrases, so "pin" does not fire
    on "shopping", "won" on "wonderful", or "closed" on "enclosed".

    Args:
        text: Email content. ``None`` is treated as empty; other
            non-string values are converted with ``str``.

    Returns:
        A value between 0 and 0.99: the accumulated indicator points
        divided by 200, capped at 0.99.
    """
    if not isinstance(text, str):
        text = "" if text is None else str(text)
    text_lower = text.lower()
    score = 0

    # High-risk phishing keywords (weight: 15 points each)
    high_risk = ['verify', 'suspended', 'urgent', 'immediately', 'click here', 'act now',
                 'confirm identity', 'account locked', 'unusual activity', 'security alert',
                 'expire', 'limited time', 'action required', 'update payment', 'validate']
    score += 15 * _count_terms(high_risk, text)

    # Financial/security keywords (weight: 12 points each)
    financial = ['bank', 'credit card', 'password', 'ssn', 'social security', 'paypal',
                 'billing', 'payment', 'account number', 'pin', 'cvv', 'credential']
    score += 12 * _count_terms(financial, text)

    # Prize/reward scam indicators (weight: 18 points each)
    prize_scam = ['won', 'winner', 'prize', 'claim now', 'congratulations', 'free money',
                  'inheritance', 'lottery', 'jackpot', 'cash prize', '$1000', '$10000']
    score += 18 * _count_terms(prize_scam, text)

    # Urgency + financial combo (weight: 25 points)
    urgency = ['urgent', 'immediately', 'now', 'expire']
    money_context = ['account', 'bank', 'payment', 'card']
    if _count_terms(urgency, text) and _count_terms(money_context, text):
        score += 25

    # URLs (weight: 20 points each, capped at 40 for multiple URLs)
    url_count = len(re.findall(r'\b(?:http|www)\S+', text, re.IGNORECASE))
    score += min(url_count * 20, 40)

    # Request for credentials/info (weight: 20 points)
    if re.search(r'\b(enter|provide|submit|update|confirm).{0,20}(password|credential|info|detail)', text_lower):
        score += 20

    # Threatening language (weight: 15 points)
    threats = ['locked', 'suspended', 'terminated', 'closed', 'blocked', 'restricted']
    score += 15 * _count_terms(threats, text)

    # Generic, impersonal greetings (weight: 8 points)
    if re.search(r'\b(dear customer|dear user|dear member|dear valued)\b', text_lower):
        score += 8

    # Convert to a 0-1 scale. 200 is a normalisation constant, not a true
    # maximum: heavily flagged emails exceed it and are capped at 99%.
    max_score = 200
    return min(score / max_score, 0.99)
@st.cache_data
def _render_confusion_matrix_png(cm_rows):
    """Render the confusion-matrix heatmap and return it as PNG bytes.

    ``cm_rows`` is a tuple of row tuples so that Streamlit hashes the matrix
    contents and a different matrix produces a different cache entry.
    """
    matrix = np.asarray(cm_rows)
    plt.style.use('dark_background')
    fig, ax = plt.subplots(figsize=(5, 4), facecolor='#1a1a1a')
    try:
        ax.set_facecolor('#1a1a1a')
        sns.heatmap(
            matrix,
            annot=True,
            fmt="d",
            ax=ax,
            cmap="YlOrBr",
            cbar=True,
            square=True,
            annot_kws={"size": 16, "weight": "bold", "color": "#0f0f0f"},
            linewidths=2,
            linecolor='#0f0f0f',
            cbar_kws={'label': 'Count', 'shrink': 0.8}
        )
        ax.set_xlabel("Predicted", fontsize=11, fontweight='bold', color='#FFD700')
        ax.set_ylabel("Actual", fontsize=11, fontweight='bold', color='#FFD700')
        # The two class names only fit a binary matrix; for any other shape
        # keep the default numeric ticks rather than mislabel them.
        if matrix.shape == (2, 2):
            ax.set_xticklabels(["Safe", "Phishing"], fontsize=10, color='#e5e7eb')
            ax.set_yticklabels(["Safe", "Phishing"], fontsize=10, rotation=0, color='#e5e7eb')
        ax.set_title("Confusion Matrix", fontsize=13, fontweight='bold', pad=12, color='#FFD700')

        # Style the colorbar
        cbar = ax.collections[0].colorbar
        cbar.ax.yaxis.set_tick_params(color='#e5e7eb')
        plt.setp(plt.getp(cbar.ax.axes, 'yticklabels'), color='#e5e7eb')

        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format='png', facecolor='#1a1a1a', dpi=100)
    finally:
        # Always release the figure, even if rendering fails part-way.
        plt.close(fig)
    return buf.getvalue()


def generate_confusion_matrix_plot(_cm):
    """Return the confusion-matrix heatmap as an in-memory PNG.

    Streamlit excludes underscore-prefixed parameters from the cache key, so
    caching this function directly would return the first plot ever rendered
    for every later matrix. The matrix is converted to a hashable tuple and
    rendering is delegated to a helper that is cached on that value.

    Args:
        _cm: 2-D confusion matrix of integer counts (array-like).

    Returns:
        An ``io.BytesIO`` positioned at the start of the PNG data.
    """
    cm_rows = tuple(tuple(row) for row in np.asarray(_cm).tolist())
    return io.BytesIO(_render_confusion_matrix_png(cm_rows))
 # Hero Header
 st.markdown("""
 <div class="hero-container">
     <div class="hero-title">🛡️ AI Phishing Shield</div>
     <div class="hero-subtitle">Advanced Machine Learning Protection Against Email Threats</div>
     <div class="hero-description">
+        Powered by TF-IDF vectorization and Logistic Regression, trained on thousands of real-world phishing examples.
         Get instant threat analysis with confidence scoring and explainable AI insights.
     </div>
     <div class="hero-badge">⚡ Developed by Umaima Qureshi</div>
         ]
     })
# Validate dataset
# The frame needs at least two columns and at least one row; otherwise
# report the problem and halt the script run.
required_columns = 2
dataset_is_usable = len(df.columns) >= required_columns and len(df) > 0
if not dataset_is_usable:
    st.error("⚠️ Invalid dataset format. Please ensure your CSV has email text and labels.")
    st.stop()
 # Clean & Prepare Dataset
 if "Unnamed: 0" in df.columns:
     df = df.drop(columns=["Unnamed: 0"])
 # Model Training
 @st.cache_resource
 def train_model(processed_texts, labels, test_size=0.2, random_state=42):
+    """Enhanced model training with better parameters"""
     unique_labels, counts = np.unique(labels, return_counts=True)
     min_samples = counts.min()
                 processed_texts, labels, test_size=test_size, random_state=random_state, stratify=None
             )
+    # Enhanced TF-IDF with better parameters for phishing detection
+    vectorizer = TfidfVectorizer(
+        max_features=5000,
+        ngram_range=(1,3),  # Include trigrams for better context
+        min_df=1,
+        max_df=0.95,
+        sublinear_tf=True
+    )
     X_train_vec = vectorizer.fit_transform(X_train)
     X_test_vec = vectorizer.transform(X_test)
+    # Use balanced class weights for better phishing detection
+    model = LogisticRegression(
+        max_iter=2000,
+        solver='liblinear',
+        class_weight='balanced',  # Handle imbalanced data better
+        C=1.0
+    )
     model.fit(X_train_vec, y_train)
     y_pred = model.predict(X_test_vec)
     acc = accuracy_score(y_test, y_pred)
     cm = confusion_matrix(y_test, y_pred)
     report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
     return {
         "vectorizer": vectorizer,
         "model": model,
         "report": report
     }
# Train model with session state to prevent re-training
# NOTE(review): once `model_trained` is set, the stored result is reused for
# the rest of the session even if `df` differs on a later rerun - confirm the
# dataset cannot change mid-session.
if st.session_state.model_trained:
    model_info = st.session_state.model_info
else:
    model_info = train_model(df['processed_text'].tolist(), df['label'].values)
    st.session_state.model_info = model_info
    st.session_state.model_trained = True

# Unpack the pieces the rest of the page uses.
vectorizer, model, accuracy = (
    model_info[part] for part in ("vectorizer", "model", "accuracy")
)
 # Model Performance
 st.markdown('<div class="section-title">🎯 Model Performance</div>', unsafe_allow_html=True)
     </div>
     """, unsafe_allow_html=True)
+# Confusion Matrix Section: plot in the first column, classification report in the second
 with st.expander("📈 Detailed Metrics & Confusion Matrix"):
     col_matrix, col_report = st.columns([1, 1.5])
     with col_matrix:
+        # Build the plot only when session state holds no cached copy; later reruns reuse the stored one
+        if st.session_state.cm_plot_cached is None:
+            st.session_state.cm_plot_cached = generate_confusion_matrix_plot(model_info["confusion_matrix"])
+        st.image(st.session_state.cm_plot_cached, use_column_width=True)  # NOTE(review): use_column_width is deprecated in newer Streamlit releases — confirm the pinned version before switching
     with col_report:
         st.markdown("**📊 Classification Report:**")
         report_df = pd.DataFrame(model_info["report"]).transpose().round(3)
         st.dataframe(
+            report_df,
+            use_container_width=True,
             height=250
         )
             email_input = uploaded_txt.read().decode("utf-8", errors="ignore")
         except Exception:
             email_input = str(uploaded_txt.getvalue())
     if st.button("🔍 Analyze Email Threat"):
         if not email_input.strip():
             st.warning("⚠️ Please paste or upload email content to analyze")
         else:
+            # Sanitize input
+            email_input = sanitize_input(email_input)
+            # Validate input
+            is_valid, error_msg = validate_email_input(email_input)
+            if not is_valid:
+                st.warning(f"⚠️ {error_msg}")
+            else:
+                with st.spinner("🔍 Analyzing email threat..."):
                     try:
+                        # ML Model prediction
+                        processed_input = preprocess_text_cached(email_input)
+                        input_vec = vectorizer.transform([processed_input])
+                        try:
+                            ml_proba = model.predict_proba(input_vec)[0][1]
+                        except AttributeError:
+                            decision = model.decision_function(input_vec)[0]
+                            ml_proba = 1 / (1 + np.exp(-decision))
+                        ml_pred = model.predict(input_vec)[0]
+                        # Rule-based scoring
+                        rule_score = calculate_phishing_score(email_input)
+                        # Hybrid approach: weighted combination
+                        # 60% ML model + 40% rule-based (adjustable)
+                        hybrid_proba = (0.6 * ml_proba) + (0.4 * rule_score)
+                        # Final prediction based on hybrid score
+                        final_pred = 1 if hybrid_proba > 0.5 else 0
+                        # Dynamic color based on confidence
+                        if hybrid_proba >= 0.8:
+                            alert_color = "#dc2626"  # Deep red - Critical
+                            alert_gradient = "linear-gradient(135deg, #dc2626 0%, #991b1b 100%)"
+                            shadow_color = "220, 38, 38"
+                            emoji = "🚨"
+                            risk_level = "CRITICAL THREAT"
+                        elif hybrid_proba >= 0.6:
+                            alert_color = "#ef4444"  # Red - High risk
+                            alert_gradient = "linear-gradient(135deg, #ef4444 0%, #dc2626 100%)"
+                            shadow_color = "239, 68, 68"
+                            emoji = "⚠️"
+                            risk_level = "HIGH RISK"
+                        elif hybrid_proba >= 0.4:
+                            alert_color = "#f97316"  # Orange - Medium risk
+                            alert_gradient = "linear-gradient(135deg, #f97316 0%, #ea580c 100%)"
+                            shadow_color = "249, 115, 22"
+                            emoji = "⚡"
+                            risk_level = "MEDIUM RISK"
+                        elif hybrid_proba >= 0.2:
+                            alert_color = "#eab308"  # Yellow - Low risk
+                            alert_gradient = "linear-gradient(135deg, #eab308 0%, #ca8a04 100%)"
+                            shadow_color = "234, 179, 8"
+                            emoji = "⚠️"
+                            risk_level = "LOW RISK"
+                        else:
+                            alert_color = "#10b981"  # Green - Safe
+                            alert_gradient = "linear-gradient(135deg, #10b981 0%, #059669 100%)"
+                            shadow_color = "16, 185, 129"
+                            emoji = "✅"
+                            risk_level = "SAFE"
+                        if final_pred == 1:
+                            conf_pct = f"{hybrid_proba:.1%}"
+                            st.markdown(f"""
+                            <div class="alert-box" style="background: {alert_gradient}; box-shadow: 0 10px 30px rgba({shadow_color}, 0.4), 0 0 50px rgba({shadow_color}, 0.2);">
+                                <div style="display: flex; align-items: center; gap: 1rem; margin-bottom: 0.75rem;">
+                                    <div style="font-size: 2.5rem;">{emoji}</div>
+                                    <div>
+                                        <div style="font-size: 1.5rem; font-weight: 800; letter-spacing: 0.5px;">{risk_level} DETECTED</div>
+                                        <div style="font-size: 1.05rem; opacity: 0.95; margin-top: 0.25rem;">Threat Confidence: {conf_pct}</div>
+                                        <div style="font-size: 0.9rem; opacity: 0.85; margin-top: 0.25rem;">ML Score: {ml_proba:.1%} | Rule Score: {rule_score:.1%}</div>
+                                    </div>
+                                </div>
+                                <div class="confidence-bar">
+                                    <div class="confidence-fill" style="width: {hybrid_proba*100}%;"></div>
+                                </div>
                             </div>
+                            """, unsafe_allow_html=True)
+                            st.markdown("**🔍 Threat Indicators Detected:**")
+                            indicators = []
+                            if "suspiciousurl" in processed_input or re.search(r'http\S+|www\S+', email_input, re.IGNORECASE):
+                                indicators.append("🔗 Suspicious URL tokens detected")
+                            if re.search(r'\b(urgent|immediately|verify|password|suspended|click|act now|action required)\b', email_input, re.IGNORECASE):
+                                indicators.append("⚡ Urgency manipulation tactics")
+                            if re.search(r'\b(bank|account|verify|login|password|security|credential|paypal)\b', email_input, re.IGNORECASE):
+                                indicators.append("🏦 Financial/security keywords present")
+                            if re.search(r'\b(winner|prize|congratulations|claim|free|won)\b', email_input, re.IGNORECASE):
+                                indicators.append("🎁 Reward/prize baiting language")
+                            if re.search(r'\b(confirm|update|validate|unlock|restore)\b', email_input, re.IGNORECASE):
+                                indicators.append("🔐 Account action requests")
+                            if "cardnumber" in processed_input:
+                                indicators.append("💳 Credit card pattern detected")
+                            if "moneymention" in processed_input:
+                                indicators.append("💰 Money amount mentioned")
+                            for indicator in indicators:
+                                st.markdown(f"- {indicator}")
+                            if not indicators:
+                                st.markdown("- ⚠️ Content pattern matches known phishing templates")
+                            st.error("🚨 **Recommendation:** Do NOT click any links. Delete this email immediately and report to your IT security team.")
+                            # Download analysis report
+                            result_data = {
+                                'timestamp': pd.Timestamp.now(),
+                                'prediction': 'Phishing',
+                                'confidence': f"{hybrid_proba:.2%}",
+                                'ml_score': f"{ml_proba:.2%}",
+                                'rule_score': f"{rule_score:.2%}",
+                                'risk_level': risk_level,
+                                'email_preview': email_input[:100] + "..."
+                            }
+                            result_df = pd.DataFrame([result_data])
+                            csv = result_df.to_csv(index=False)
+                            st.download_button(
+                                label="📥 Download Analysis Report",
+                                data=csv,
+                                file_name=f"phishing_analysis_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.csv",
+                                mime="text/csv"
+                            )
+                        else:
+                            conf_pct = f"{(1-hybrid_proba):.1%}"
+                            st.markdown(f"""
+                            <div class="alert-box" style="background: {alert_gradient}; box-shadow: 0 10px 30px rgba({shadow_color}, 0.4), 0 0 50px rgba({shadow_color}, 0.2);">
+                                <div style="display: flex; align-items: center; gap: 1rem; margin-bottom: 0.75rem;">
+                                    <div style="font-size: 2.5rem;">{emoji}</div>
+                                    <div>
+                                        <div style="font-size: 1.5rem; font-weight: 800; letter-spacing: 0.5px;">EMAIL APPEARS SAFE</div>
+                                        <div style="font-size: 1.05rem; opacity: 0.95; margin-top: 0.25rem;">Safety Confidence: {conf_pct}</div>
+                                        <div style="font-size: 0.9rem; opacity: 0.85; margin-top: 0.25rem;">ML Score: {(1-ml_proba):.1%} | Rule Score: {(1-rule_score):.1%}</div>
+                                    </div>
+                                </div>
+                                <div class="confidence-bar">
+                                    <div class="confidence-fill" style="width: {(1-hybrid_proba)*100}%;"></div>
+                                </div>
                             </div>
+                            """, unsafe_allow_html=True)
+                            st.markdown("**✓ No obvious threat indicators found in content analysis**")
+                            st.info("💡 **Best Practice:** Always verify sender identity through known contact methods and be cautious with unexpected emails, even if they appear safe.")
+                        # Add to history
+                        st.session_state.analysis_history.append({
+                            'timestamp': pd.Timestamp.now(),
+                            'result': 'Phishing' if final_pred == 1 else 'Safe',
+                            'confidence': f"{hybrid_proba:.2%}",
+                            'preview': email_input[:50] + "..."
+                        })
+                    except Exception as e:
+                        st.error(f"⚠️ Analysis failed: {str(e)}")
 with col_hints:
     st.markdown("""
             <div><strong>Prize/reward</strong> language is a common phishing tactic</div>
         </div>
+        <div class="hint-item">
+            <div class="hint-icon">⚡</div>
+            <div><strong>Hybrid Detection:</strong> Combines ML model (60%) with rule-based scoring (40%)</div>
+        </div>
         <div class="hint-item">
             <div class="hint-icon">⚠️</div>
             <div><strong>Limitations:</strong> This tool analyzes text content only. Always verify sender identity separately.</div>
     </div>
     """, unsafe_allow_html=True)
+# Recent Analyses History
+if len(st.session_state.analysis_history) > 0:
+    st.markdown('<div class="section-title">📊 Recent Analyses</div>', unsafe_allow_html=True)
+    with st.expander("View Recent Analysis History", expanded=False):
+        hist_df = pd.DataFrame(st.session_state.analysis_history[-10:])  # Show last 10
+        hist_df = hist_df.iloc[::-1]  # Reverse to show most recent first
+        st.dataframe(hist_df, use_container_width=True, height=300)
+        if st.button("🗑️ Clear History"):
+            st.session_state.analysis_history = []
+            st.rerun()
 # Additional Tips Section
 st.markdown('<div class="section-title">💡 Phishing Protection Tips</div>', unsafe_allow_html=True)
 with col_tip1:
     st.markdown("""
+    <div style="background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
+                padding: 1.5rem; border-radius: 16px; border-left: 4px solid #FFD700;
                 box-shadow: 0 4px 15px rgba(0,0,0,0.3); height: 100%;">
         <div style="font-size: 2rem; margin-bottom: 0.75rem;">🔍</div>
         <div style="font-weight: 700; font-size: 1.1rem; color: #FFD700; margin-bottom: 0.75rem;">Verify Sender</div>
 with col_tip2:
     st.markdown("""
+    <div style="background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
+                padding: 1.5rem; border-radius: 16px; border-left: 4px solid #FFD700;
                 box-shadow: 0 4px 15px rgba(0,0,0,0.3); height: 100%;">
         <div style="font-size: 2rem; margin-bottom: 0.75rem;">🔗</div>
         <div style="font-weight: 700; font-size: 1.1rem; color: #FFD700; margin-bottom: 0.75rem;">Hover Links</div>
 with col_tip3:
     st.markdown("""
+    <div style="background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
+                padding: 1.5rem; border-radius: 16px; border-left: 4px solid #FFD700;
                 box-shadow: 0 4px 15px rgba(0,0,0,0.3); height: 100%;">
         <div style="font-size: 2rem; margin-bottom: 0.75rem;">📞</div>
         <div style="font-weight: 700; font-size: 1.1rem; color: #FFD700; margin-bottom: 0.75rem;">Contact Directly</div>
         For production use: Implement additional verification layers, link scanning, attachment analysis, and human oversight
     </div>
     <div style="margin-top: 1.5rem; padding-top: 1.5rem; border-top: 1px solid rgba(218,165,32,0.2); font-size: 0.9rem; color: #6b7280;">
+        Powered by TF-IDF • Logistic Regression • Hybrid Detection • Scikit-learn • Streamlit
     </div>
     <div style="margin-top: 1rem; font-size: 0.85rem; color: #6b7280;">
         © 2024 AI Phishing Shield | All Rights Reserved