# Spaces: Sleeping
# File size: 11,116 Bytes
# aa03395 cecd6ed aa03395 cecd6ed
import gradio as gr
from transformers import pipeline
import re
# Build the text-classification pipeline once at import time; the prediction
# functions below reuse this module-level ``classifier`` on every request.
print("Loading model...")
classifier = pipeline(
    "text-classification",
    model="Umranz/modernbert-enron-spam",
    tokenizer="Umranz/modernbert-enron-spam",
)
print("✅ Model loaded!")
# Words, phrases and symbols that commonly appear in spam. They are compared
# against lower-cased email text, so every entry must itself be lower-case.
SPAM_KEYWORDS = [
    "free", "win", "winner", "won", "prize", "claim", "urgent",
    "congratulations", "offer", "limited", "act now", "click here",
    "buy now", "cheap", "discount", "guaranteed", "money", "cash",
    "reward", "selected", "exclusive", "deal", "risk free", "no cost",
    "bonus", "gift", "million", "billion", "dollar", "$", "€", "£",
    "100%", "password", "verify", "account", "suspended", "compromised",
    "medication", "pharmacy", "prescription", "weight loss", "earn",
    "income", "investment", "crypto", "bitcoin", "profit",
]
def _keyword_regex(keyword):
    """Build a regex that matches ``keyword`` as a whole word or phrase."""
    # Only anchor the sides that end in a word character: symbols such as
    # "$" or "100%" must still match when glued to neighbouring text ("$500").
    head = r"(?<!\w)" if keyword[0].isalnum() else ""
    if keyword[-1].isalpha():
        # Tolerate a simple plural ("offers", "bonuses") but nothing longer,
        # so "win" no longer fires on "window".
        tail = r"(?:e?s)?(?!\w)"
    elif keyword[-1].isalnum():
        tail = r"(?!\w)"
    else:
        tail = ""
    return head + re.escape(keyword) + tail


def highlight_keywords(text, keywords=None):
    """Return the spam keywords that occur in ``text``, wrapped in backticks.

    Matching is case-insensitive and works on whole words/phrases, so a
    keyword embedded in a longer word ("earn" in "learn") is not reported.

    Args:
        text: Text to scan; ``None`` or an empty string yields no matches.
        keywords: Optional iterable of keywords to look for. Defaults to the
            module-level ``SPAM_KEYWORDS`` list.

    Returns:
        A list of matched keywords formatted as Markdown inline code
        (for example ``"`free`"``), in the order the keywords are listed.
    """
    if not text:
        return []
    if keywords is None:
        keywords = SPAM_KEYWORDS

    lowered = text.lower()
    return [
        f"`{kw}`"
        for kw in keywords
        if kw and re.search(_keyword_regex(kw.lower()), lowered)
    ]
def predict_single(subject, body, threshold):
    """Classify one email and build the four Markdown panels shown in the UI.

    Args:
        subject: Email subject line; may be empty or ``None``.
        body: Email body text. A blank body short-circuits with a warning.
        threshold: Confidence cut-off in percent. A top prediction scoring
            below it is reported as low confidence instead of a firm verdict.

    Returns:
        A 4-tuple of Markdown strings
        ``(verdict, keyword_analysis, risk_summary, tips)``. When ``body`` is
        blank the first item is a warning and the other three are empty.
    """
    subject = subject or ""
    body = body or ""
    if not body.strip():
        return "⚠️ Please enter email body.", "", "", ""

    combined = f"Subject: {subject}\n\n{body}" if subject.strip() else body
    result = classifier(combined, truncation=True, max_length=512)[0]
    label = result["label"]
    score = result["score"] * 100
    is_spam = label == "SPAM"

    # ``score`` is the confidence of the *predicted* label, so for a HAM
    # verdict the spam probability is the complement.
    # NOTE(review): assumes a two-class model whose scores sum to 1 - confirm.
    spam_score = score if is_spam else 100 - score

    # 20-cell text bar, one cell per 5 percentage points of confidence.
    filled = int(score // 5)
    bar = "█" * filled + "░" * (20 - filled)

    if score < threshold:
        verdict = (
            f"## ⚠️ Low Confidence ({score:.1f}%)\n\n"
            f"Best guess: **{label}** but below your threshold of {threshold}%.\n\n"
            "Try adding more email content for better accuracy."
        )
    elif is_spam:
        verdict = (
            "## 🚨 SPAM DETECTED\n\n"
            f"**Confidence:** {score:.1f}%\n\n"
            f"{bar} {score:.1f}%\n\n"
            "⚠️ This email shows strong spam characteristics.\n\n"
            "**Recommendation:** Do not click any links or reply."
        )
    else:
        verdict = (
            "## ✅ LEGITIMATE EMAIL (HAM)\n\n"
            f"**Confidence:** {score:.1f}%\n\n"
            f"{bar} {score:.1f}%\n\n"
            "This email appears to be legitimate."
        )

    found_keywords = highlight_keywords(f"{subject} {body}")
    if found_keywords:
        if len(found_keywords) > 5:
            density_note = "⚠️ High keyword density — be cautious!"
        else:
            density_note = "⚡ Low keyword density."
        keyword_analysis = (
            f"## 🔍 Spam Keywords Found ({len(found_keywords)})\n\n"
            f"{', '.join(found_keywords)}\n\n"
            f"{density_note}"
        )
    else:
        keyword_analysis = (
            "## 🔍 Spam Keywords\n\n✅ No common spam keywords detected."
        )

    if not is_spam:
        risk_level = "🟢 SAFE"
    elif score > 80:
        risk_level = "🔴 HIGH RISK"
    elif score > 50:
        risk_level = "🟠 MEDIUM RISK"
    else:
        risk_level = "🟡 LOW RISK"

    risk_summary = (
        "## 🛡️ Risk Assessment\n\n"
        f"**Risk Level:** {risk_level}\n\n"
        f"**Spam Score:** {spam_score:.1f}%\n\n"
        f"**Keywords Found:** {len(found_keywords)}\n\n"
        f"**Model Verdict:** {label}"
    )

    if is_spam:
        tips = (
            "## 💡 Safety Tips\n\n"
            "- 🚫 Don't click any links in this email\n"
            "- 🚫 Don't reply or provide personal info\n"
            "- 🚫 Don't download any attachments\n"
            "- ✅ Mark as spam in your email client\n"
            "- ✅ Block the sender"
        )
    else:
        tips = (
            "## 💡 Tips\n\n"
            "- ✅ Email appears safe to read\n"
            "- ✅ Always verify sender identity\n"
            "- ✅ Be cautious with attachments\n"
            "- ⚡ Even legitimate emails can have phishing links"
        )

    return verdict, keyword_analysis, risk_summary, tips
def predict_batch(emails_text, threshold):
    """Classify several emails (one per line) and return a Markdown report.

    Args:
        emails_text: Newline-separated emails; blank lines are ignored.
        threshold: Confidence cut-off in percent. Results scoring below it are
            tagged as low confidence. ``None`` disables the tagging.

    Returns:
        A Markdown string with one entry per email (label, confidence, a
        10-cell bar and an 80-character preview) followed by spam/ham totals,
        or a warning message when no email text was supplied.
    """
    if not emails_text or not emails_text.strip():
        return "⚠️ Please enter at least one email."

    emails = [
        line.strip() for line in emails_text.strip().split("\n") if line.strip()
    ]
    results = classifier(emails, truncation=True, max_length=512)

    spam_count = 0
    sections = ["## 📊 Batch Analysis Results\n\n"]
    for email, res in zip(emails, results):
        label = res["label"]
        score = res["score"] * 100
        is_spam = label == "SPAM"
        if is_spam:
            spam_count += 1

        emoji = "🚨" if is_spam else "✅"
        # 10-cell text bar, one cell per 10 percentage points of confidence.
        filled = int(score // 10)
        bar = "█" * filled + "░" * (10 - filled)
        # Flag weak predictions the same way the single-email view does.
        low_conf = threshold is not None and score < threshold
        flag = " ⚠️ low confidence" if low_conf else ""
        preview = email[:80] + ("..." if len(email) > 80 else "")
        sections.append(
            f"{emoji} **{label}** ({score:.1f}%) {bar}{flag}\n"
            f"> {preview}\n\n"
        )

    # ``emails`` is never empty here: the guard above rejects blank input.
    total = len(emails)
    ham_count = total - spam_count
    sections.append(
        "---\n"
        "## 📈 Summary\n\n"
        f"- **Total:** {total} emails\n"
        f"- **🚨 Spam:** {spam_count} ({spam_count / total * 100:.1f}%)\n"
        f"- **✅ Ham:** {ham_count} ({ham_count / total * 100:.1f}%)"
    )
    return "".join(sections)
# UI layout: three tabs (single email, batch, about) wired to the prediction
# functions defined above.
with gr.Blocks(title="SpamShield AI") as demo:
    gr.Markdown("""
    # 🛡️ SpamShield AI — Email Spam Classifier

    **ModernBERT (2024) fine-tuned on 31K Enron Emails**

    > Paste any email to instantly detect if it's spam or legitimate.
    """)

    with gr.Tabs():
        with gr.Tab("📧 Single Email"):
            with gr.Row():
                # Left column: inputs and the trigger button.
                with gr.Column(scale=2):
                    subject_input = gr.Textbox(
                        lines=1,
                        placeholder="e.g. You Won $1,000,000!!!",
                        label="📌 Email Subject",
                    )
                    body_input = gr.Textbox(
                        lines=8,
                        placeholder="Paste the email body here...",
                        label="📝 Email Body",
                    )
                    threshold_slider = gr.Slider(
                        minimum=10,
                        maximum=90,
                        value=50,
                        step=5,
                        label="⚙️ Confidence Threshold (%)",
                        info="Predictions below this % show a low-confidence warning",
                    )
                    analyze_btn = gr.Button(
                        "🔍 Analyze Email",
                        variant="primary",
                        size="lg",
                    )
                # Right column: the four Markdown panels from predict_single.
                with gr.Column(scale=3):
                    verdict_output = gr.Markdown(label="Verdict")
                    keyword_output = gr.Markdown(label="Keyword Analysis")
                    with gr.Row():
                        risk_output = gr.Markdown(label="Risk Assessment")
                        tips_output = gr.Markdown(label="Safety Tips")

            gr.Examples(
                examples=[
                    ["You Won $1,000,000!!!", "Congratulations! You have been selected as our lucky winner. Click here to claim your prize now. Limited time offer!", 50],
                    ["Meeting Notes", "Hi team, please find attached the meeting notes from yesterday's standup. Let me know if I missed anything.", 50],
                    ["URGENT: Verify Account", "Your account has been compromised. Verify your credentials immediately to avoid suspension. Click the link below.", 50],
                    ["Quick Question", "Hey, are you free for a call tomorrow at 3pm? I wanted to discuss the project timeline.", 50],
                    ["Exclusive Pharma Deal", "Buy cheap medications online! No prescription needed! 90% discount. Act now before offer expires!", 50],
                ],
                inputs=[subject_input, body_input, threshold_slider],
                label="💡 Try These Examples",
            )
            analyze_btn.click(
                fn=predict_single,
                inputs=[subject_input, body_input, threshold_slider],
                outputs=[verdict_output, keyword_output, risk_output, tips_output],
            )

        with gr.Tab("📋 Batch Emails"):
            gr.Markdown("Enter one email per line for bulk analysis.")
            batch_input = gr.Textbox(
                lines=10,
                placeholder=(
                    "Congratulations you won $1M click here now!\n"
                    "Hi John, meeting is rescheduled to 3pm.\n"
                    "Buy cheap meds online no prescription needed!\n"
                    "Please review the attached quarterly report."
                ),
                label="📝 Emails (one per line)",
            )
            batch_threshold = gr.Slider(
                minimum=10,
                maximum=90,
                value=50,
                step=5,
                label="⚙️ Confidence Threshold (%)",
            )
            batch_btn = gr.Button("🔍 Analyze All", variant="primary", size="lg")
            batch_output = gr.Markdown(label="Batch Results")
            gr.Examples(
                examples=[[
                    "Congratulations you won $1M click here now!\n"
                    "Hi John, meeting is rescheduled to 3pm.\n"
                    "Buy cheap meds online no prescription needed!\n"
                    "Please review the attached quarterly report.\n"
                    "URGENT your account will be suspended verify now!\n"
                    "Are you free for lunch this Friday?"
                ]],
                inputs=[batch_input],
                label="💡 Example",
            )
            batch_btn.click(
                fn=predict_batch,
                inputs=[batch_input, batch_threshold],
                outputs=[batch_output],
            )

        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
            ## 🛡️ SpamShield AI

            ### 🧠 Model
            - **Architecture:** ModernBERT (2024) — state of the art
            - **Fine-tuned on:** Enron Spam Dataset (31,716 emails)
            - **Input:** Subject + Body combined for maximum context
            - **Context window:** 512 tokens

            ### 📊 Performance

            | Metric | Score |
            |---|---|
            | Accuracy | ~98% |
            | F1 Score | ~98% |
            | Precision | ~98% |
            | Recall | ~98% |

            ### 🚀 Features
            - ✅ Single email analysis with risk assessment
            - ✅ Spam keyword highlighting
            - ✅ Confidence threshold control
            - ✅ Batch analysis for multiple emails
            - ✅ Safety tips for spam emails

            ### 📧 Dataset
            - **Enron Email Dataset** — Real corporate emails
            - 31,716 training samples
            - Industry standard benchmark

            ### 👨‍💻 Built By
            Umranz — [HuggingFace Profile](https://huggingface.co/Umranz)

            > ⚠️ This tool is for educational purposes.
            > Always use proper email security tools in production.
            """)

# NOTE(review): passing ``theme`` to launch() looks like the Gradio 6 API;
# older Gradio releases expect it on gr.Blocks() - confirm the pinned version.
demo.launch(theme=gr.themes.Soft())