# Spaces: Umranz / SpamShield-AI (status: Sleeping)
# File size: 11,116 Bytes

import gradio as gr
from transformers import pipeline
import re

# Load the fine-tuned spam classifier once at import time; the model weights
# and the tokenizer come from the same Hugging Face repository.
_MODEL_REPO = "Umranz/modernbert-enron-spam"

print("Loading model...")
classifier = pipeline(task="text-classification", model=_MODEL_REPO, tokenizer=_MODEL_REPO)
print("✅ Model loaded!")
# Lower-case words, phrases and symbols that commonly show up in spam.
# Order matters: highlight_keywords() reports hits in this order.
SPAM_KEYWORDS = [
    "free", "win", "winner", "won", "prize", "claim", "urgent",
    "congratulations", "offer", "limited", "act now", "click here",
    "buy now", "cheap", "discount", "guaranteed", "money", "cash",
    "reward", "selected", "exclusive", "deal", "risk free", "no cost",
    "bonus", "gift", "million", "billion", "dollar", "$", "€", "£",
    "100%", "password", "verify", "account", "suspended", "compromised",
    "medication", "pharmacy", "prescription", "weight loss", "earn",
    "income", "investment", "crypto", "bitcoin", "profit"
]


def _keyword_pattern(keyword):
    """Compile a regex matching *keyword* as a whole word or phrase."""
    # Words of a phrase may be separated by any whitespace run (e.g. a line
    # break in a wrapped email), not only a single space.
    core = r"\s+".join(re.escape(part) for part in keyword.split())
    # Boundary guards are only applied next to a letter/digit: symbol
    # keywords such as "$" must still match when glued to a number ("$100").
    prefix = r"(?<!\w)" if keyword[0].isalnum() else ""
    # Allow a simple plural ("prizes", "bonuses") but nothing longer, so
    # "win" no longer fires on "window" and "earn" no longer fires on "learn".
    suffix = r"(?:e?s)?(?!\w)" if keyword[-1].isalnum() else ""
    return re.compile(prefix + core + suffix)


# Compiled once at import time; pairs of (keyword, compiled pattern).
_KEYWORD_PATTERNS = [(kw, _keyword_pattern(kw)) for kw in SPAM_KEYWORDS]


def highlight_keywords(text):
    """Return the spam keywords that occur in *text*.

    Matching is case-insensitive and whole-word: a keyword embedded in a
    longer, unrelated word is not reported.

    Args:
        text: Text to scan; ``None`` is treated as an empty string.

    Returns:
        A list of the matched keywords, each wrapped in backticks for
        Markdown rendering, in ``SPAM_KEYWORDS`` order. Each keyword appears
        at most once.
    """
    text_lower = (text or "").lower()
    return [
        f"`{kw}`"
        for kw, pattern in _KEYWORD_PATTERNS
        if pattern.search(text_lower)
    ]

def _confidence_bar(score, width=20):
    """Return a *width*-cell text bar filled in proportion to *score* (0-100)."""
    filled = int(score // (100 / width))
    return "█" * filled + "░" * (width - filled)


def predict_single(subject, body, threshold):
    """Classify one email and build the four Markdown panels for the UI.

    Args:
        subject: Email subject line; may be empty or ``None``.
        body: Email body text; required.
        threshold: Confidence percentage. When the model's confidence is
            below it, the verdict panel shows a low-confidence warning
            instead of a SPAM/HAM verdict.

    Returns:
        A 4-tuple of Markdown strings:
        ``(verdict, keyword_analysis, risk_summary, tips)``.
        When the body is blank, the first element is a prompt asking for a
        body and the other three are empty strings.
    """
    # Tolerate None inputs by treating them as empty text.
    subject = subject or ""
    body = body or ""
    if not body.strip():
        return "⚠️ Please enter email body.", "", "", ""

    # Prepend the subject only when one was supplied.
    combined = f"Subject: {subject}\n\n{body}" if subject.strip() else body
    result = classifier(combined, truncation=True, max_length=512)[0]
    label = result["label"]
    score = result["score"] * 100  # confidence in the predicted label, in percent
    is_spam = label == "SPAM"
    # `score` belongs to whichever label won. For the "Spam Score" line we
    # need the probability of SPAM specifically, otherwise a 99%-confident
    # HAM verdict would be displayed as a 99% spam score.
    # Assumes a two-class (SPAM / not-SPAM) model.
    spam_score = score if is_spam else 100 - score

    if score < threshold:
        verdict = (
            f"## ⚠️ Low Confidence ({score:.1f}%)\n\n"
            f"Best guess: **{label}** but below your threshold of {threshold}%.\n\n"
            "Try adding more email content for better accuracy."
        )
    elif is_spam:
        verdict = (
            "## 🚨 SPAM DETECTED\n\n"
            f"**Confidence:** {score:.1f}%\n\n"
            f"{_confidence_bar(score)} {score:.1f}%\n\n"
            "⚠️ This email shows strong spam characteristics.\n\n"
            "**Recommendation:** Do not click any links or reply."
        )
    else:
        verdict = (
            "## ✅ LEGITIMATE EMAIL (HAM)\n\n"
            f"**Confidence:** {score:.1f}%\n\n"
            f"{_confidence_bar(score)} {score:.1f}%\n\n"
            "This email appears to be legitimate."
        )

    found_keywords = highlight_keywords(f"{subject} {body}")
    if found_keywords:
        density_note = (
            "⚠️ High keyword density — be cautious!"
            if len(found_keywords) > 5
            else "⚡ Low keyword density."
        )
        keyword_analysis = (
            f"## 🔍 Spam Keywords Found ({len(found_keywords)})\n\n"
            f"{', '.join(found_keywords)}\n\n"
            f"{density_note}"
        )
    else:
        keyword_analysis = "## 🔍 Spam Keywords\n\n✅ No common spam keywords detected."

    # Risk tiers only apply to SPAM verdicts; anything else is reported safe.
    if not is_spam:
        risk_level = "🟢 SAFE"
    elif score > 80:
        risk_level = "🔴 HIGH RISK"
    elif score > 50:
        risk_level = "🟠 MEDIUM RISK"
    else:
        risk_level = "🟡 LOW RISK"

    risk_summary = (
        "## 🛡️ Risk Assessment\n\n"
        f"**Risk Level:** {risk_level}\n\n"
        f"**Spam Score:** {spam_score:.1f}%\n\n"
        f"**Keywords Found:** {len(found_keywords)}\n\n"
        f"**Model Verdict:** {label}"
    )

    if is_spam:
        tips = (
            "## 💡 Safety Tips\n\n"
            "- 🚫 Don't click any links in this email\n"
            "- 🚫 Don't reply or provide personal info\n"
            "- 🚫 Don't download any attachments\n"
            "- ✅ Mark as spam in your email client\n"
            "- ✅ Block the sender"
        )
    else:
        tips = (
            "## 💡 Tips\n\n"
            "- ✅ Email appears safe to read\n"
            "- ✅ Always verify sender identity\n"
            "- ✅ Be cautious with attachments\n"
            "- ⚡ Even legitimate emails can have phishing links"
        )

    return verdict, keyword_analysis, risk_summary, tips

def predict_batch(emails_text, threshold):
    """Classify several emails (one per line) and return a Markdown report.

    Args:
        emails_text: Newline-separated emails; blank lines are ignored.
            ``None`` or whitespace-only input yields a prompt message.
        threshold: Confidence percentage. Predictions whose confidence is
            below it are flagged as low confidence in the report. ``None``
            disables the flagging.

    Returns:
        A Markdown string with one entry per email (label, confidence, bar
        and an 80-character preview) followed by a spam/ham summary. A
        low-confidence count is appended to the summary only when at least
        one prediction fell below *threshold*.
    """
    if not emails_text or not emails_text.strip():
        return "⚠️ Please enter at least one email."

    emails = [ln.strip() for ln in emails_text.strip().split("\n") if ln.strip()]
    results = classifier(emails, truncation=True, max_length=512)

    spam_count = 0
    low_confidence_count = 0
    parts = ["## 📋 Batch Analysis Results\n\n"]

    for email, res in zip(emails, results):
        label = res["label"]
        score = res["score"] * 100
        is_spam = label == "SPAM"
        if is_spam:
            spam_count += 1

        # Same rule as the single-email view: a prediction under the
        # user-selected threshold is a best guess, so mark it as such.
        is_low = threshold is not None and score < threshold
        if is_low:
            low_confidence_count += 1

        emoji = "🚨" if is_spam else "✅"
        filled = int(score // 10)
        bar = "█" * filled + "░" * (10 - filled)
        note = " ⚠️ *low confidence*" if is_low else ""
        preview = email[:80] + ("..." if len(email) > 80 else "")

        parts.append(
            f"{emoji} **{label}** ({score:.1f}%) {bar}{note}\n"
            f"> {preview}\n\n"
        )

    # `emails` is never empty here: the guard above guarantees at least one
    # non-blank line, so the percentage divisions below are safe.
    total = len(emails)
    ham_count = total - spam_count
    parts.append(
        "---\n"
        "## 📊 Summary\n\n"
        f"- **Total:** {total} emails\n"
        f"- **🚨 Spam:** {spam_count} ({spam_count/total*100:.1f}%)\n"
        f"- **✅ Ham:** {ham_count} ({ham_count/total*100:.1f}%)"
    )
    if low_confidence_count:
        parts.append(
            f"\n- **⚠️ Low confidence (below {threshold}%):** {low_confidence_count}"
        )
    return "".join(parts)

# Build the Gradio UI: one Blocks app bound to `demo`, containing three tabs
# (single-email analysis, batch analysis, and an about page).
with gr.Blocks(title="SpamShield AI") as demo:

    # Page header.
    gr.Markdown("""
    # 🛡️ SpamShield AI — Email Spam Classifier
    **ModernBERT (2024) fine-tuned on 31K Enron Emails**
    > Paste any email to instantly detect if it's spam or legitimate.
    """)

    with gr.Tabs():

        # ---- Tab 1: analyze a single email (subject + body) ----
        with gr.Tab("📧 Single Email"):
            with gr.Row():
                # Left column: inputs and the trigger button.
                with gr.Column(scale=2):
                    subject_input = gr.Textbox(
                        lines=1,
                        placeholder="e.g. You Won $1,000,000!!!",
                        label="📌 Email Subject"
                    )
                    body_input = gr.Textbox(
                        lines=8,
                        placeholder="Paste the email body here...",
                        label="📝 Email Body"
                    )
                    # Passed to predict_single as its `threshold` argument.
                    threshold_slider = gr.Slider(
                        minimum=10,
                        maximum=90,
                        value=50,
                        step=5,
                        label="⚙️ Confidence Threshold (%)",
                        info="Predictions below this % show a low-confidence warning"
                    )
                    analyze_btn = gr.Button(
                        "🔍 Analyze Email",
                        variant="primary",
                        size="lg"
                    )

                # Right column: verdict and keyword panels.
                with gr.Column(scale=3):
                    verdict_output   = gr.Markdown(label="Verdict")
                    keyword_output   = gr.Markdown(label="Keyword Analysis")

            # Second row: risk assessment and tips panels.
            with gr.Row():
                risk_output = gr.Markdown(label="Risk Assessment")
                tips_output = gr.Markdown(label="Safety Tips")

            # Each example row is [subject, body, threshold], matching `inputs`.
            gr.Examples(
                examples=[
                    ["You Won $1,000,000!!!",  "Congratulations! You have been selected as our lucky winner. Click here to claim your prize now. Limited time offer!", 50],
                    ["Meeting Notes",           "Hi team, please find attached the meeting notes from yesterday's standup. Let me know if I missed anything.", 50],
                    ["URGENT: Verify Account",  "Your account has been compromised. Verify your credentials immediately to avoid suspension. Click the link below.", 50],
                    ["Quick Question",          "Hey, are you free for a call tomorrow at 3pm? I wanted to discuss the project timeline.", 50],
                    ["Exclusive Pharma Deal",   "Buy cheap medications online! No prescription needed! 90% discount. Act now before offer expires!", 50],
                ],
                inputs=[subject_input, body_input, threshold_slider],
                label="💡 Try These Examples"
            )

            # predict_single returns four Markdown strings, one per output
            # component, in this order.
            analyze_btn.click(
                fn=predict_single,
                inputs=[subject_input, body_input, threshold_slider],
                outputs=[verdict_output, keyword_output, risk_output, tips_output]
            )
        # ---- Tab 2: analyze many emails, one per line ----
        with gr.Tab("📋 Batch Emails"):
            gr.Markdown("Enter one email per line for bulk analysis.")
            batch_input = gr.Textbox(
                lines=10,
                placeholder=(
                    "Congratulations you won $1M click here now!\n"
                    "Hi John, meeting is rescheduled to 3pm.\n"
                    "Buy cheap meds online no prescription needed!\n"
                    "Please review the attached quarterly report."
                ),
                label="📝 Emails (one per line)"
            )
            # Passed to predict_batch as its `threshold` argument.
            batch_threshold = gr.Slider(
                minimum=10,
                maximum=90,
                value=50,
                step=5,
                label="⚙️ Confidence Threshold (%)"
            )
            batch_btn    = gr.Button("🔍 Analyze All", variant="primary", size="lg")
            batch_output = gr.Markdown(label="Batch Results")

            # A single example that fills only the textbox (the slider is
            # not listed in `inputs`).
            gr.Examples(
                examples=[[
                    "Congratulations you won $1M click here now!\n"
                    "Hi John, meeting is rescheduled to 3pm.\n"
                    "Buy cheap meds online no prescription needed!\n"
                    "Please review the attached quarterly report.\n"
                    "URGENT your account will be suspended verify now!\n"
                    "Are you free for lunch this Friday?"
                ]],
                inputs=[batch_input],
                label="💡 Example"
            )

            batch_btn.click(
                fn=predict_batch,
                inputs=[batch_input, batch_threshold],
                outputs=[batch_output]
            )

        # ---- Tab 3: static project description ----
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
            ## 🛡️ SpamShield AI

            ### 🧠 Model
            - **Architecture:** ModernBERT (2024) — state of the art
            - **Fine-tuned on:** Enron Spam Dataset (31,716 emails)
            - **Input:** Subject + Body combined for maximum context
            - **Context window:** 512 tokens

            ### 📊 Performance
            | Metric | Score |
            |---|---|
            | Accuracy | ~98% |
            | F1 Score | ~98% |
            | Precision | ~98% |
            | Recall | ~98% |

            ### 🔍 Features
            - ✅ Single email analysis with risk assessment
            - ✅ Spam keyword highlighting
            - ✅ Confidence threshold control
            - ✅ Batch analysis for multiple emails
            - ✅ Safety tips for spam emails

            ### 📧 Dataset
            - **Enron Email Dataset** — Real corporate emails
            - 31,716 training samples
            - Industry standard benchmark

            ### 👨‍💻 Built By
            Umranz — [HuggingFace Profile](https://huggingface.co/Umranz)

            > ⚠️ This tool is for educational purposes.
            > Always use proper email security tools in production.
            """)

# Start the app with the Soft theme.
# NOTE(review): `theme` is passed to launch() rather than to gr.Blocks();
# confirm the installed Gradio version accepts it there.
demo.launch(theme=gr.themes.Soft())