File size: 11,116 Bytes
aa03395
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cecd6ed
aa03395
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cecd6ed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
import gradio as gr
from transformers import pipeline
import re

# Hugging Face Hub id of the fine-tuned checkpoint; the same repository
# provides both the model weights and the tokenizer.
MODEL_ID = "Umranz/modernbert-enron-spam"

# Build the text-classification pipeline once at import time so that every
# prediction call in this module reuses the same loaded model.
print("Loading model...")
classifier = pipeline(
    "text-classification",
    model=MODEL_ID,
    tokenizer=MODEL_ID,
)
print("✅ Model loaded!")
# Lower-case words, phrases and symbols that commonly appear in spam.
# Entries are compared against lower-cased email text, so they must stay
# lower-case. Order matters: matches are reported in this order.
# The currency symbols are real Unicode characters (they were previously
# stored as mis-decoded byte sequences that could never match an email).
SPAM_KEYWORDS = [
    "free", "win", "winner", "won", "prize", "claim", "urgent",
    "congratulations", "offer", "limited", "act now", "click here",
    "buy now", "cheap", "discount", "guaranteed", "money", "cash",
    "reward", "selected", "exclusive", "deal", "risk free", "no cost",
    "bonus", "gift", "million", "billion", "dollar", "$", "€", "£",
    "100%", "password", "verify", "account", "suspended", "compromised",
    "medication", "pharmacy", "prescription", "weight loss", "earn",
    "income", "investment", "crypto", "bitcoin", "profit",
]

def highlight_keywords(text, keywords=None):
    """Return the spam keywords that occur in *text*, wrapped in backticks.

    Matching is case-insensitive and word-aware: a keyword only counts when
    it appears as a whole word (optionally with a plural ``s``/``es``), so
    ``earn`` is not reported for "learn" and ``win`` is not reported for
    "window". Keywords that start or end with a symbol (``$``, ``100%``) are
    not anchored on that side, so ``$`` still matches inside "$500".
    Multi-word keywords match across any run of whitespace.

    Args:
        text: The email text to scan.
        keywords: Optional iterable of keywords to look for. Defaults to the
            module-level ``SPAM_KEYWORDS`` list.

    Returns:
        A list of matched keywords formatted as Markdown inline code
        (e.g. ``"`free`"``), in the order they appear in *keywords*.
    """
    if keywords is None:
        keywords = SPAM_KEYWORDS

    haystack = text.lower()
    found = []
    for kw in keywords:
        parts = kw.lower().split()
        if not parts:
            # A blank keyword would match every text; ignore it.
            continue

        pattern = r"\s+".join(re.escape(part) for part in parts)
        # Only anchor the sides that end in a letter/digit; a word-boundary
        # check next to a symbol such as "$" would reject "$500".
        if parts[0][0].isalnum():
            pattern = r"(?<!\w)" + pattern
        if parts[-1][-1].isalnum():
            pattern += r"(?:s|es)?(?!\w)"

        if re.search(pattern, haystack):
            found.append(f"`{kw}`")
    return found

def _confidence_bar(score, width=20):
    """Render *score* (a 0-100 percentage) as a text bar *width* cells wide."""
    filled = max(0, min(width, int(score * width // 100)))
    return "█" * filled + "░" * (width - filled)


def predict_single(subject, body, threshold):
    """Classify one email and build the four Markdown panels shown in the UI.

    Args:
        subject: Email subject line; may be empty, in which case only the
            body is sent to the classifier.
        body: Email body text. Required.
        threshold: Confidence threshold on the same 0-100 scale as the
            model score; predictions below it get a low-confidence verdict.

    Returns:
        A 4-tuple of Markdown strings:
        ``(verdict, keyword_analysis, risk_summary, tips)``. When *body* is
        blank, the first element is a warning and the other three are empty.
    """
    # Treat a missing value like an empty string instead of raising.
    subject = subject or ""
    body = body or ""

    if not body.strip():
        return "⚠️ Please enter email body.", "", "", ""

    combined = f"Subject: {subject}\n\n{body}" if subject.strip() else body
    result = classifier(combined, truncation=True, max_length=512)[0]
    label = result["label"]
    score = result["score"] * 100  # confidence of the predicted label, in %
    is_spam = label == "SPAM"

    # The pipeline reports the confidence of the *predicted* label, so for a
    # non-spam prediction the spam probability is the complement.
    # NOTE(review): assumes a two-class softmax head — confirm against the
    # model configuration.
    spam_score = score if is_spam else 100 - score

    if score < threshold:
        verdict = (
            f"## ⚠️ Low Confidence ({score:.1f}%)\n\n"
            f"Best guess: **{label}** but below your threshold of {threshold}%.\n\n"
            f"Try adding more email content for better accuracy."
        )
    elif is_spam:
        verdict = (
            f"## 🚨 SPAM DETECTED\n\n"
            f"**Confidence:** {score:.1f}%\n\n"
            f"{_confidence_bar(score)} {score:.1f}%\n\n"
            f"⚠️ This email shows strong spam characteristics.\n\n"
            f"**Recommendation:** Do not click any links or reply."
        )
    else:
        verdict = (
            f"## ✅ LEGITIMATE EMAIL (HAM)\n\n"
            f"**Confidence:** {score:.1f}%\n\n"
            f"{_confidence_bar(score)} {score:.1f}%\n\n"
            f"This email appears to be legitimate."
        )

    found_keywords = highlight_keywords(f"{subject} {body}")
    if found_keywords:
        density_note = (
            "⚠️ High keyword density — be cautious!"
            if len(found_keywords) > 5
            else "⚡ Low keyword density."
        )
        keyword_analysis = (
            f"## 🔍 Spam Keywords Found ({len(found_keywords)})\n\n"
            f"{', '.join(found_keywords)}\n\n"
            f"{density_note}"
        )
    else:
        keyword_analysis = "## 🔍 Spam Keywords\n\n✅ No common spam keywords detected."

    # Risk tiers apply only to spam verdicts; anything else is reported safe.
    if not is_spam:
        risk_level = "🟢 SAFE"
    elif score > 80:
        risk_level = "🔴 HIGH RISK"
    elif score > 50:
        risk_level = "🟠 MEDIUM RISK"
    else:
        risk_level = "🟡 LOW RISK"

    risk_summary = (
        f"## 🛡️ Risk Assessment\n\n"
        f"**Risk Level:** {risk_level}\n\n"
        f"**Spam Score:** {spam_score:.1f}%\n\n"
        f"**Keywords Found:** {len(found_keywords)}\n\n"
        f"**Model Verdict:** {label}"
    )

    if is_spam:
        tips = (
            "## 💡 Safety Tips\n\n"
            "- 🚫 Don't click any links in this email\n"
            "- 🚫 Don't reply or provide personal info\n"
            "- 🚫 Don't download any attachments\n"
            "- ✅ Mark as spam in your email client\n"
            "- ✅ Block the sender"
        )
    else:
        tips = (
            "## 💡 Tips\n\n"
            "- ✅ Email appears safe to read\n"
            "- ✅ Always verify sender identity\n"
            "- ✅ Be cautious with attachments\n"
            "- ⚡ Even legitimate emails can have phishing links"
        )

    return verdict, keyword_analysis, risk_summary, tips

def predict_batch(emails_text, threshold):
    """Classify several emails (one per line) and return a Markdown report.

    Args:
        emails_text: Text containing one email per line; blank lines are
            ignored.
        threshold: Confidence threshold on the same 0-100 scale as the
            model score. Predictions below it are flagged as low confidence
            in the report and counted separately in the summary.

    Returns:
        A Markdown string with one entry per email (label, confidence, bar
        and an 80-character preview) followed by a summary, or a warning
        message when no email text was supplied.
    """
    if not emails_text or not emails_text.strip():
        return "⚠️ Please enter at least one email."

    lines = [ln.strip() for ln in emails_text.strip().split("\n") if ln.strip()]
    results = classifier(lines, truncation=True, max_length=512)

    spam_count = 0
    ham_count = 0
    low_conf_count = 0
    # Collect fragments and join once instead of growing a string in a loop.
    sections = ["## 📋 Batch Analysis Results\n\n"]

    for line, res in zip(lines, results):
        label = res["label"]
        score = res["score"] * 100
        is_spam = label == "SPAM"

        filled = int(score // 10)
        bar = "█" * filled + "░" * (10 - filled)

        if is_spam:
            spam_count += 1
        else:
            ham_count += 1

        # Predictions under the threshold keep their label but are marked so
        # the reader knows not to rely on them.
        low_confidence = score < threshold
        if low_confidence:
            low_conf_count += 1
            emoji = "⚠️"
            note = " — low confidence"
        else:
            emoji = "🚨" if is_spam else "✅"
            note = ""

        preview = line[:80] + ("..." if len(line) > 80 else "")
        sections.append(
            f"{emoji} **{label}** ({score:.1f}%) {bar}{note}\n"
            f"> {preview}\n\n"
        )

    total = len(lines)  # never zero: the guard above ensures one non-blank line
    sections.append(
        f"---\n"
        f"## 📊 Summary\n\n"
        f"- **Total:** {total} emails\n"
        f"- **🚨 Spam:** {spam_count} ({spam_count/total*100:.1f}%)\n"
        f"- **✅ Ham:** {ham_count} ({ham_count/total*100:.1f}%)\n"
        f"- **⚠️ Below {threshold}% confidence:** {low_conf_count}"
    )
    return "".join(sections)

# Gradio UI: three tabs (single email, batch, about). Components are created
# in layout order inside the nested context managers, so statement order here
# determines what appears where on the page.
with gr.Blocks(title="SpamShield AI") as demo:

    gr.Markdown("""
    # 🛡️ SpamShield AI — Email Spam Classifier
    **ModernBERT (2024) fine-tuned on 31K Enron Emails**
    > Paste any email to instantly detect if it's spam or legitimate.
    """)

    with gr.Tabs():

        with gr.Tab("📧 Single Email"):
            with gr.Row():
                # Left column: inputs and the action button.
                with gr.Column(scale=2):
                    subject_input = gr.Textbox(
                        lines=1,
                        placeholder="e.g. You Won $1,000,000!!!",
                        label="📌 Email Subject"
                    )
                    body_input = gr.Textbox(
                        lines=8,
                        placeholder="Paste the email body here...",
                        label="📝 Email Body"
                    )
                    threshold_slider = gr.Slider(
                        minimum=10,
                        maximum=90,
                        value=50,
                        step=5,
                        label="⚙️ Confidence Threshold (%)",
                        info="Predictions below this % show a low-confidence warning"
                    )
                    analyze_btn = gr.Button(
                        "🔍 Analyze Email",
                        variant="primary",
                        size="lg"
                    )

                # Right column: verdict and keyword panels.
                with gr.Column(scale=3):
                    verdict_output   = gr.Markdown(label="Verdict")
                    keyword_output   = gr.Markdown(label="Keyword Analysis")

            with gr.Row():
                risk_output = gr.Markdown(label="Risk Assessment")
                tips_output = gr.Markdown(label="Safety Tips")

            # Each example row is [subject, body, threshold].
            gr.Examples(
                examples=[
                    ["You Won $1,000,000!!!",  "Congratulations! You have been selected as our lucky winner. Click here to claim your prize now. Limited time offer!", 50],
                    ["Meeting Notes",           "Hi team, please find attached the meeting notes from yesterday's standup. Let me know if I missed anything.", 50],
                    ["URGENT: Verify Account",  "Your account has been compromised. Verify your credentials immediately to avoid suspension. Click the link below.", 50],
                    ["Quick Question",          "Hey, are you free for a call tomorrow at 3pm? I wanted to discuss the project timeline.", 50],
                    ["Exclusive Pharma Deal",   "Buy cheap medications online! No prescription needed! 90% discount. Act now before offer expires!", 50],
                ],
                inputs=[subject_input, body_input, threshold_slider],
                label="💡 Try These Examples"
            )

            # predict_single returns four Markdown strings, one per output.
            analyze_btn.click(
                fn=predict_single,
                inputs=[subject_input, body_input, threshold_slider],
                outputs=[verdict_output, keyword_output, risk_output, tips_output]
            )
        with gr.Tab("📋 Batch Emails"):
            gr.Markdown("Enter one email per line for bulk analysis.")
            batch_input = gr.Textbox(
                lines=10,
                placeholder=(
                    "Congratulations you won $1M click here now!\n"
                    "Hi John, meeting is rescheduled to 3pm.\n"
                    "Buy cheap meds online no prescription needed!\n"
                    "Please review the attached quarterly report."
                ),
                label="📝 Emails (one per line)"
            )
            batch_threshold = gr.Slider(
                minimum=10,
                maximum=90,
                value=50,
                step=5,
                label="⚙️ Confidence Threshold (%)"
            )
            batch_btn    = gr.Button("🔍 Analyze All", variant="primary", size="lg")
            batch_output = gr.Markdown(label="Batch Results")

            gr.Examples(
                examples=[[
                    "Congratulations you won $1M click here now!\n"
                    "Hi John, meeting is rescheduled to 3pm.\n"
                    "Buy cheap meds online no prescription needed!\n"
                    "Please review the attached quarterly report.\n"
                    "URGENT your account will be suspended verify now!\n"
                    "Are you free for lunch this Friday?"
                ]],
                inputs=[batch_input],
                label="💡 Example"
            )

            batch_btn.click(
                fn=predict_batch,
                inputs=[batch_input, batch_threshold],
                outputs=[batch_output]
            )

        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
            ## 🛡️ SpamShield AI

            ### 🧠 Model
            - **Architecture:** ModernBERT (2024) — state of the art
            - **Fine-tuned on:** Enron Spam Dataset (31,716 emails)
            - **Input:** Subject + Body combined for maximum context
            - **Context window:** 512 tokens

            ### 📊 Performance
            | Metric | Score |
            |---|---|
            | Accuracy | ~98% |
            | F1 Score | ~98% |
            | Precision | ~98% |
            | Recall | ~98% |

            ### 🔍 Features
            - ✅ Single email analysis with risk assessment
            - ✅ Spam keyword highlighting
            - ✅ Confidence threshold control
            - ✅ Batch analysis for multiple emails
            - ✅ Safety tips for spam emails

            ### 📧 Dataset
            - **Enron Email Dataset** — Real corporate emails
            - 31,716 training samples
            - Industry standard benchmark

            ### 👨‍💻 Built By
            Umranz — [HuggingFace Profile](https://huggingface.co/Umranz)

            > ⚠️ This tool is for educational purposes.
            > Always use proper email security tools in production.
            """)

# NOTE(review): passing `theme` to launch() depends on the installed Gradio
# version accepting it there — confirm against the pinned version.
demo.launch(theme=gr.themes.Soft())