Spaces:
Running
Running
| import gradio as gr | |
| from transformers import pipeline | |
| import re | |
# Load the text-classification pipeline once at import time so that every
# request handler defined in this file reuses the same `classifier` object.
_MODEL_ID = "Umranz/modernbert-enron-spam"

print("Loading model...")
classifier = pipeline(
    task="text-classification",
    model=_MODEL_ID,
    tokenizer=_MODEL_ID,  # tokenizer comes from the same repository as the weights
)
print("β Model loaded!")
# Lower-case words, phrases and symbols that commonly appear in spam.
# They are compared against lower-cased email text, so every entry must be
# lower-case itself. The euro and pound signs were previously stored as
# mis-encoded byte sequences that could never match real input.
SPAM_KEYWORDS = [
    "free", "win", "winner", "won", "prize", "claim", "urgent",
    "congratulations", "offer", "limited", "act now", "click here",
    "buy now", "cheap", "discount", "guaranteed", "money", "cash",
    "reward", "selected", "exclusive", "deal", "risk free", "no cost",
    "bonus", "gift", "million", "billion", "dollar", "$", "€", "£",
    "100%", "password", "verify", "account", "suspended", "compromised",
    "medication", "pharmacy", "prescription", "weight loss", "earn",
    "income", "investment", "crypto", "bitcoin", "profit",
]
def highlight_keywords(text, keywords=None):
    """Return the spam keywords that occur in *text*, formatted for Markdown.

    Matching is case-insensitive and respects token boundaries, so a keyword
    such as ``"earn"`` is not reported for ``"learn"`` and ``"win"`` is not
    reported for ``"window"``. A trailing plural ``s``/``es`` is still
    accepted (``"medication"`` matches ``"medications"``), and the words of a
    multi-word keyword may be separated by any run of whitespace.

    Args:
        text: Text to scan; ``None`` or an empty string yields no matches.
        keywords: Optional iterable of keywords to look for. Defaults to the
            module-level ``SPAM_KEYWORDS`` list.

    Returns:
        A list with one entry per matching keyword, each wrapped in
        backticks, in the same order as *keywords*.
    """
    if not text:
        return []
    if keywords is None:
        keywords = SPAM_KEYWORDS

    haystack = text.lower()
    found = []
    for kw in keywords:
        words = kw.lower().split()
        if not words:
            # An empty/blank keyword would match everywhere; ignore it.
            continue
        core = r"\s+".join(re.escape(word) for word in words)

        # Only anchor on a side where the keyword itself starts/ends with a
        # word character; symbols such as "$" or a trailing "%" need no
        # boundary and must match even when glued to a number.
        prefix = r"(?<!\w)" if words[0][0].isalnum() else ""
        last_char = words[-1][-1]
        if last_char.isalpha():
            suffix = r"(?:s|es)?(?!\w)"
        elif last_char.isalnum():
            suffix = r"(?!\w)"
        else:
            suffix = ""

        if re.search(prefix + core + suffix, haystack):
            found.append(f"`{kw}`")
    return found
def _confidence_bar(score, width=20):
    """Render a 0-100 ``score`` as a text bar that is ``width`` cells wide."""
    filled = max(0, min(width, int(score // (100 / width))))
    return "█" * filled + "░" * (width - filled)


def predict_single(subject, body, threshold):
    """Classify one email and build the four Markdown panels shown in the UI.

    Args:
        subject: Subject line of the email; may be empty or ``None``.
        body: Body text of the email; required.
        threshold: Confidence percentage below which the verdict panel is
            replaced by a low-confidence warning.

    Returns:
        A 4-tuple of Markdown strings
        ``(verdict, keyword_analysis, risk_summary, tips)``. When the body is
        blank, the first element is a prompt and the other three are empty.
    """
    # NOTE(review): emoji/dash glyphs in this function were restored from
    # mis-encoded text; the magnifier icon is inferred from context -- confirm.
    subject = subject or ""
    body = body or ""
    if not body.strip():
        return "⚠️ Please enter email body.", "", "", ""

    combined = f"Subject: {subject}\n\n{body}" if subject.strip() else body
    result = classifier(combined, truncation=True, max_length=512)[0]
    label = result["label"]
    score = result["score"] * 100  # confidence of the predicted label, in percent
    is_spam = label == "SPAM"
    # The pipeline reports the score of the winning label only, so for a HAM
    # verdict the spam likelihood is the complement.
    # NOTE(review): assumes a two-class SPAM/HAM model -- confirm.
    spam_score = score if is_spam else 100.0 - score

    if score < threshold:
        verdict = (
            f"## ⚠️ Low Confidence ({score:.1f}%)\n\n"
            f"Best guess: **{label}** but below your threshold of {threshold}%.\n\n"
            "Try adding more email content for better accuracy."
        )
    elif is_spam:
        verdict = (
            "## 🚨 SPAM DETECTED\n\n"
            f"**Confidence:** {score:.1f}%\n\n"
            f"{_confidence_bar(score)} {score:.1f}%\n\n"
            "⚠️ This email shows strong spam characteristics.\n\n"
            "**Recommendation:** Do not click any links or reply."
        )
    else:
        verdict = (
            "## ✅ LEGITIMATE EMAIL (HAM)\n\n"
            f"**Confidence:** {score:.1f}%\n\n"
            f"{_confidence_bar(score)} {score:.1f}%\n\n"
            "This email appears to be legitimate."
        )

    found_keywords = highlight_keywords(f"{subject} {body}")
    if found_keywords:
        density_note = (
            "⚠️ High keyword density — be cautious!"
            if len(found_keywords) > 5
            else "⚡ Low keyword density."
        )
        keyword_analysis = (
            f"## 🔍 Spam Keywords Found ({len(found_keywords)})\n\n"
            f"{', '.join(found_keywords)}\n\n"
            f"{density_note}"
        )
    else:
        keyword_analysis = "## 🔍 Spam Keywords\n\n✅ No common spam keywords detected."

    if is_spam and score > 80:
        risk_level = "🔴 HIGH RISK"
    elif is_spam and score > 50:
        risk_level = "🟠 MEDIUM RISK"
    elif is_spam:
        risk_level = "🟡 LOW RISK"
    else:
        risk_level = "🟢 SAFE"

    risk_summary = (
        "## 🛡️ Risk Assessment\n\n"
        f"**Risk Level:** {risk_level}\n\n"
        f"**Spam Score:** {spam_score:.1f}%\n\n"
        f"**Keywords Found:** {len(found_keywords)}\n\n"
        f"**Model Verdict:** {label}"
    )

    if is_spam:
        tips = (
            "## 💡 Safety Tips\n\n"
            "- 🚫 Don't click any links in this email\n"
            "- 🚫 Don't reply or provide personal info\n"
            "- 🚫 Don't download any attachments\n"
            "- ✅ Mark as spam in your email client\n"
            "- ✅ Block the sender"
        )
    else:
        tips = (
            "## 💡 Tips\n\n"
            "- ✅ Email appears safe to read\n"
            "- ✅ Always verify sender identity\n"
            "- ✅ Be cautious with attachments\n"
            "- ⚡ Even legitimate emails can have phishing links"
        )

    return verdict, keyword_analysis, risk_summary, tips
def predict_batch(emails_text, threshold):
    """Classify several one-line emails and return a Markdown report.

    Args:
        emails_text: Text containing one email per line; blank lines are
            ignored. ``None`` or blank input yields a prompt message.
        threshold: Confidence percentage; predictions scoring below it are
            flagged as low confidence in the report. ``None`` disables the
            flagging.

    Returns:
        A Markdown string with one entry per email followed by a summary of
        spam/ham counts (and the low-confidence count when there is any).
    """
    # NOTE(review): emoji glyphs in this function were restored from
    # mis-encoded text; the two section-heading icons are inferred -- confirm.
    if not emails_text or not emails_text.strip():
        return "⚠️ Please enter at least one email."

    lines = [ln.strip() for ln in emails_text.strip().split("\n") if ln.strip()]
    results = classifier(lines, truncation=True, max_length=512)

    spam_count = 0
    ham_count = 0
    low_count = 0
    parts = ["## 📊 Batch Analysis Results\n\n"]
    for line, res in zip(lines, results):
        label = res["label"]
        score = res["score"] * 100  # confidence of the predicted label, in percent
        is_spam = label == "SPAM"
        if is_spam:
            spam_count += 1
        else:
            ham_count += 1

        # Ten-cell bar: one filled cell per full 10% of confidence.
        filled = max(0, min(10, int(score // 10)))
        bar = "█" * filled + "░" * (10 - filled)

        # Honour the threshold slider, which was previously ignored here.
        low_confidence = threshold is not None and score < threshold
        if low_confidence:
            low_count += 1
        note = " ⚠️ low confidence" if low_confidence else ""

        emoji = "🚨" if is_spam else "✅"
        preview = line[:80] + ("..." if len(line) > 80 else "")
        parts.append(
            f"{emoji} **{label}** ({score:.1f}%) {bar}{note}\n"
            f"> {preview}\n\n"
        )

    total = len(lines)  # never zero: the blank-input case returned above
    parts.append(
        "---\n"
        "## 📈 Summary\n\n"
        f"- **Total:** {total} emails\n"
        f"- **🚨 Spam:** {spam_count} ({spam_count/total*100:.1f}%)\n"
        f"- **✅ Ham:** {ham_count} ({ham_count/total*100:.1f}%)"
    )
    if low_count:
        parts.append(
            f"\n- **⚠️ Low confidence:** {low_count} (below {threshold}%)"
        )
    return "".join(parts)
# Gradio UI: a three-tab Blocks app (single email, batch, about) whose buttons
# call predict_single / predict_batch, followed by the launch call.
# NOTE(review): the nesting below was reconstructed from a copy of this file
# that had lost its indentation -- confirm the Row/Column/Tab placement.
# NOTE(review): several labels and Markdown strings contain mis-encoded
# characters (originally emoji/dashes); they are reproduced here unchanged.
# The triple-quoted Markdown bodies are kept flush-left so their content is
# unchanged.
with gr.Blocks(title="SpamShield AI") as demo:
    # Page header.
    gr.Markdown("""
# π‘οΈ SpamShield AI β Email Spam Classifier
**ModernBERT (2024) fine-tuned on 31K Enron Emails**
> Paste any email to instantly detect if it's spam or legitimate.
""")
    with gr.Tabs():
        # --- Tab 1: analyse a single email (subject + body) ---
        with gr.Tab("π§ Single Email"):
            with gr.Row():
                # Left column: inputs.
                with gr.Column(scale=2):
                    subject_input = gr.Textbox(
                        lines=1,
                        placeholder="e.g. You Won $1,000,000!!!",
                        label="π Email Subject"
                    )
                    body_input = gr.Textbox(
                        lines=8,
                        placeholder="Paste the email body here...",
                        label="π Email Body"
                    )
                    # Passed to predict_single as `threshold` (percent).
                    threshold_slider = gr.Slider(
                        minimum=10,
                        maximum=90,
                        value=50,
                        step=5,
                        label="βοΈ Confidence Threshold (%)",
                        info="Predictions below this % show a low-confidence warning"
                    )
                    analyze_btn = gr.Button(
                        "π Analyze Email",
                        variant="primary",
                        size="lg"
                    )
                # Right column: the four Markdown panels returned by predict_single.
                with gr.Column(scale=3):
                    verdict_output = gr.Markdown(label="Verdict")
                    keyword_output = gr.Markdown(label="Keyword Analysis")
                    with gr.Row():
                        risk_output = gr.Markdown(label="Risk Assessment")
                        tips_output = gr.Markdown(label="Safety Tips")
            # Each example row is [subject, body, threshold], matching `inputs`.
            gr.Examples(
                examples=[
                    ["You Won $1,000,000!!!", "Congratulations! You have been selected as our lucky winner. Click here to claim your prize now. Limited time offer!", 50],
                    ["Meeting Notes", "Hi team, please find attached the meeting notes from yesterday's standup. Let me know if I missed anything.", 50],
                    ["URGENT: Verify Account", "Your account has been compromised. Verify your credentials immediately to avoid suspension. Click the link below.", 50],
                    ["Quick Question", "Hey, are you free for a call tomorrow at 3pm? I wanted to discuss the project timeline.", 50],
                    ["Exclusive Pharma Deal", "Buy cheap medications online! No prescription needed! 90% discount. Act now before offer expires!", 50],
                ],
                inputs=[subject_input, body_input, threshold_slider],
                label="π‘ Try These Examples"
            )
            # Output order matches predict_single's 4-tuple return value.
            analyze_btn.click(
                fn=predict_single,
                inputs=[subject_input, body_input, threshold_slider],
                outputs=[verdict_output, keyword_output, risk_output, tips_output]
            )
        # --- Tab 2: analyse many one-line emails at once ---
        with gr.Tab("π Batch Emails"):
            gr.Markdown("Enter one email per line for bulk analysis.")
            batch_input = gr.Textbox(
                lines=10,
                placeholder=(
                    "Congratulations you won $1M click here now!\n"
                    "Hi John, meeting is rescheduled to 3pm.\n"
                    "Buy cheap meds online no prescription needed!\n"
                    "Please review the attached quarterly report."
                ),
                label="π Emails (one per line)"
            )
            # Passed to predict_batch as its second argument (`threshold`).
            batch_threshold = gr.Slider(
                minimum=10,
                maximum=90,
                value=50,
                step=5,
                label="βοΈ Confidence Threshold (%)"
            )
            batch_btn = gr.Button("π Analyze All", variant="primary", size="lg")
            batch_output = gr.Markdown(label="Batch Results")
            # A single example: six emails joined into one newline-separated string.
            gr.Examples(
                examples=[[
                    "Congratulations you won $1M click here now!\n"
                    "Hi John, meeting is rescheduled to 3pm.\n"
                    "Buy cheap meds online no prescription needed!\n"
                    "Please review the attached quarterly report.\n"
                    "URGENT your account will be suspended verify now!\n"
                    "Are you free for lunch this Friday?"
                ]],
                inputs=[batch_input],
                label="π‘ Example"
            )
            batch_btn.click(
                fn=predict_batch,
                inputs=[batch_input, batch_threshold],
                outputs=[batch_output]
            )
        # --- Tab 3: static project information ---
        with gr.Tab("βΉοΈ About"):
            gr.Markdown("""
## π‘οΈ SpamShield AI
### π§ Model
- **Architecture:** ModernBERT (2024) β state of the art
- **Fine-tuned on:** Enron Spam Dataset (31,716 emails)
- **Input:** Subject + Body combined for maximum context
- **Context window:** 512 tokens
### π Performance
| Metric | Score |
|---|---|
| Accuracy | ~98% |
| F1 Score | ~98% |
| Precision | ~98% |
| Recall | ~98% |
### π Features
- β Single email analysis with risk assessment
- β Spam keyword highlighting
- β Confidence threshold control
- β Batch analysis for multiple emails
- β Safety tips for spam emails
### π§ Dataset
- **Enron Email Dataset** β Real corporate emails
- 31,716 training samples
- Industry standard benchmark
### π¨βπ» Built By
Umranz β [HuggingFace Profile](https://huggingface.co/Umranz)
> β οΈ This tool is for educational purposes.
> Always use proper email security tools in production.
""")
# NOTE(review): `theme` is passed to launch() rather than gr.Blocks();
# confirm the installed Gradio version accepts it there.
demo.launch(theme=gr.themes.Soft())