Spaces:

harshitmahour360
/

Phising

Sleeping

App Files Files Community

harshitmahour360 committed on Nov 26, 2025

Commit

ae1a5c5

verified ·

1 Parent(s): ef661e9

Create app.py

Browse files

Files changed (1) hide show

app.py +307 -0

app.py ADDED Viewed

	@@ -0,0 +1,307 @@

+import re
+import traceback
+from typing import Dict, Any, Tuple
+import gradio as gr
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
# ---------------------------------------------------------
# 1. CONFIG
# ---------------------------------------------------------
# Hugging Face checkpoint used for the model-based score (small and fast on CPU).
MODEL_NAME = "mrm8488/bert-tiny-finetuned-sms-spam-detection"

# Filled in once at startup by _load_model(): `clf_pipe` holds the ready
# text-classification pipeline on success; on failure it stays None and
# `model_load_error` holds a human-readable description of what went wrong.
clf_pipe = None
model_load_error = None

# Phrase -> weight added to the heuristic score when the phrase occurs in the
# (lower-cased) message. Insertion order is the order hits are reported in.
SUSPICIOUS_KEYWORDS = {
    "verify your account": 0.25,
    "reset your password": 0.25,
    "confirm your password": 0.2,
    "click the link below": 0.15,
    "click here": 0.15,
    "urgent": 0.1,
    "immediately": 0.1,
    "limited time": 0.1,
    "suspend your account": 0.25,
    "update your billing": 0.2,
    "unusual activity": 0.2,
    "bank account": 0.15,
    "card details": 0.15,
    "one time password": 0.15,
    "otp": 0.1,
}

# Domains of link-shortening services; a link to one of these hides the real
# destination and earns an extra heuristic penalty.
URL_SHORTENERS = [
    "bit.ly",
    "tinyurl.com",
    "t.co",
    "is.gd",
    "ow.ly",
    "buff.ly",
    "cutt.ly",
]

# Matches http/https links up to the next whitespace. URL schemes are
# case-insensitive, so "HTTP://..." and "Https://..." must be found as well.
URL_REGEX = re.compile(r"https?://\S+", re.IGNORECASE)
# ---------------------------------------------------------
# 2. LOAD MODEL ON STARTUP
# ---------------------------------------------------------
def _load_model():
    """Build the text-classification pipeline and publish it in `clf_pipe`.

    Any exception raised while loading is caught: `clf_pipe` is reset to
    None, a description is stored in `model_load_error`, and the error is
    printed, so importing this module never fails because of the model.
    """
    global clf_pipe, model_load_error
    try:
        # Tokenizer first, then model weights; both come from MODEL_NAME.
        clf_pipe = pipeline(
            "text-classification",
            tokenizer=AutoTokenizer.from_pretrained(MODEL_NAME),
            model=AutoModelForSequenceClassification.from_pretrained(MODEL_NAME),
            # Ask for the score of every label, not only the top one.
            return_all_scores=True,
        )
        print(f"[INFO] Loaded HF model: {MODEL_NAME}")
    except Exception as exc:
        clf_pipe = None
        model_load_error = (
            f"Failed to load model {MODEL_NAME}: {type(exc).__name__}: {exc}"
        )
        print("[ERROR]", model_load_error)


# Load once at import time so the first request does not pay the cost.
_load_model()
# ---------------------------------------------------------
# 3. HELPER FUNCTIONS
# ---------------------------------------------------------
def compute_model_spam_prob(text: str, outputs) -> Tuple[float, Dict[str, float]]:
    """Convert classifier outputs into a spam / phishing probability.

    Args:
        text: The classified message. Not used in the computation; kept so
            existing callers keep working.
        outputs: Iterable of mappings with a ``"label"`` and a ``"score"``
            entry, one per class.

    Returns:
        ``(spam_prob, label_scores)`` where ``spam_prob`` is clamped to
        [0, 1] and ``label_scores`` maps every label to its float score.

    Label resolution, in order:
      1. Sum the scores of all labels whose name mentions "spam" or "phish".
      2. Otherwise, for a two-label output, use the label containing "1"
         (e.g. ``LABEL_1``).
      3. Otherwise, for a two-label output, use the higher-scoring label.
      4. Otherwise the probability is 0.0.
    """
    label_scores = {o["label"]: float(o["score"]) for o in outputs}

    explicit_labels = [
        lab for lab in label_scores
        if "spam" in lab.lower() or "phish" in lab.lower()
    ]

    if explicit_labels:
        # Decide on whether a spam/phish label EXISTS, not on whether its
        # score is non-zero: a confident "not spam" (spam score 0.0) must
        # stay 0.0 instead of falling through to the guesses below.
        spam_prob = sum((label_scores[lab] for lab in explicit_labels), 0.0)
    elif len(label_scores) == 2:
        # Generic LABEL_0 / LABEL_1 style names: treat the "1" label as spam.
        spam_label = next((lab for lab in label_scores if "1" in lab), None)
        if spam_label is None:
            # NOTE: last-resort guess kept from the original behaviour; it
            # can never yield less than 0.5 for a two-class output.
            spam_label = max(label_scores, key=label_scores.get)
        spam_prob = label_scores[spam_label]
    else:
        spam_prob = 0.0

    # Clamp
    spam_prob = max(0.0, min(1.0, float(spam_prob)))
    return spam_prob, label_scores
def _url_host(url: str) -> str:
    """Return the lower-cased host of *url*, without userinfo, port or path."""
    remainder = url.split("://", 1)[-1]
    authority = re.split(r"[/?#]", remainder, maxsplit=1)[0]
    host = authority.rsplit("@", 1)[-1].split(":", 1)[0]
    # The URL regex stops only at whitespace, so a bare link may drag
    # sentence punctuation along with it.
    return host.lower().rstrip(".,;!?)>]\"'")


def _contains_phrase(phrase: str, lowered_text: str) -> bool:
    """Tell whether *phrase* occurs in *lowered_text* as whole words.

    Words of the phrase may be separated by any run of whitespace, so a
    phrase broken across a line wrap is still found.
    """
    pattern = (
        r"(?<!\w)"
        + r"\s+".join(re.escape(word) for word in phrase.split())
        + r"(?!\w)"
    )
    return re.search(pattern, lowered_text) is not None


def compute_heuristic_score(text: str) -> Tuple[float, Dict[str, Any]]:
    """Score a message with lightweight rules: suspicious phrases and URLs.

    Args:
        text: The message body.

    Returns:
        ``(probability, info)``. ``probability`` is the rule score divided
        by 1.5 and clamped to [0, 1]. ``info`` contains:
          * ``keywords_triggered``: matched phrases, in configuration order;
          * ``urls_found``: every URL found in the text;
          * ``url_warnings``: human-readable notes about risky URLs;
          * ``raw_heuristic_score``: the summed weights before
            normalisation and clamping.
    """
    lowered = text.lower()

    # Whole-word matching: a plain substring test would let "otp" fire on
    # "hotpot" and "urgent" on "insurgent".
    keyword_hits = [
        phrase for phrase in SUSPICIOUS_KEYWORDS
        if _contains_phrase(phrase, lowered)
    ]
    raw_score = sum((SUSPICIOUS_KEYWORDS[phrase] for phrase in keyword_hits), 0.0)

    urls = URL_REGEX.findall(text)
    url_flags = []
    for url in urls:
        # Compare against the host only: searching the whole URL would flag
        # e.g. "microsoft.com" because it contains "t.co".
        host = _url_host(url)
        if any(
            host == short.lower() or host.endswith("." + short.lower())
            for short in URL_SHORTENERS
        ):
            raw_score += 0.2
            url_flags.append(f"URL shortener detected: {url}")
        if url.lower().startswith("http://"):
            raw_score += 0.1
            url_flags.append(f"Insecure (http) URL: {url}")

    # Slight boost if the message is extremely short AND contains a link
    if len(text) < 60 and urls:
        raw_score += 0.1
        url_flags.append("Very short message that mainly contains a link")

    # Normalize and clamp to [0, 1]; the 1.5 divisor is the original
    # empirical scaling ("0.7 is already very suspicious").
    probability = max(0.0, min(1.0, raw_score / 1.5))

    info = {
        "keywords_triggered": keyword_hits,
        "urls_found": urls,
        "url_warnings": url_flags,
        "raw_heuristic_score": raw_score,
    }
    return probability, info
# ---------------------------------------------------------
# 4. MAIN INFERENCE FUNCTION
# ---------------------------------------------------------
def analyze_email(email_text: str):
    """Classify one email and explain the decision.

    Returns a ``(verdict, probability, details)`` triple: a short verdict
    string, the combined phishing probability rounded to four decimals, and
    a dict with the individual scores. Empty input, a missing model and any
    unexpected exception are all reported through the same triple (with a
    probability of 0.0) rather than raised.
    """
    try:
        body = (email_text or "").strip()

        if not body:
            problem = {"error": "Please paste the full email body first."}
            return "❌ No email text provided", 0.0, problem

        if clf_pipe is None:
            problem = {
                "error": model_load_error,
                "hint": "Check the Space logs or requirements.txt.",
            }
            return "❌ Model failed to load", 0.0, problem

        # Learned score: the pipeline returns one result per input, so take
        # the first (and only) entry.
        raw_outputs = clf_pipe(body, truncation=True, max_length=512)[0]
        model_spam_prob, label_scores = compute_model_spam_prob(body, raw_outputs)

        # Rule-based score.
        heuristic_prob, heuristic_info = compute_heuristic_score(body)

        # Weighted blend: 70% model, 30% heuristics, kept inside [0, 1].
        blended = 0.7 * model_spam_prob + 0.3 * heuristic_prob
        blended = max(0.0, min(1.0, float(blended)))

        if blended >= 0.5:
            verdict = "⚠️ Likely phishing / suspicious"
        else:
            verdict = "✅ Likely not phishing (still be cautious)"

        details = {
            "model_name": MODEL_NAME,
            "model_spam_probability": model_spam_prob,
            "model_label_scores": label_scores,
            "heuristic_probability": heuristic_prob,
            "heuristics": heuristic_info,
            "final_combined_probability": blended,
        }
        return verdict, round(blended, 4), details
    except Exception as exc:
        # Never crash the Space; always hand something back to the UI.
        failure = {
            "exception": f"{type(exc).__name__}: {exc}",
            "traceback": traceback.format_exc(),
        }
        return "❌ Internal error during analysis", 0.0, failure
# ---------------------------------------------------------
# 5. GRADIO UI
# ---------------------------------------------------------
def _reset_fields():
    """Return blank values for the input box and the three result widgets."""
    return "", "", 0.0, {}


# Sample emails offered below the form: one phishing-style, one benign.
examples = [
    [
        """Subject: Important – Verify Your Account Now
Dear User,
We have detected unusual activity on your account. To avoid suspension, please verify your account immediately by clicking the link below:
http://secure-update.example-login.com/verify
Failure to do so will result in permanent closure of your account.
Thank you,
Security Team"""
    ],
    [
        """Subject: Your Monthly Newsletter is Here!
Hello Harshit,
We’re excited to share this month’s updates, new features, and upcoming events with you.
No action is required—just click below to explore:
https://example.com/newsletter
Have a great day!
Team Example"""
    ],
]

with gr.Blocks(title="Phishing / Spam Email Detector (Hybrid)") as demo:
    # Page header.
    gr.Markdown(
        """
# 🛡️ Phishing / Spam Email Detector (Hybrid)
This tool combines:
1. A **Hugging Face spam/phishing classifier** (`bert-tiny` – fast on CPU), and
2. A lightweight **rule-based engine** (keywords + URL checks).
It outputs a final phishing probability and a structured JSON explanation.
        """
    )

    with gr.Row():
        # Left column: the email text and the action buttons.
        with gr.Column(scale=3):
            gr.Markdown("### ✉️ Email Content")
            email_input = gr.Textbox(
                label="Email body",
                placeholder="Paste the full email text here...",
                lines=16,
            )
            with gr.Row():
                analyze_btn = gr.Button("🔍 Analyze", variant="primary")
                clear_btn = gr.Button("🧹 Clear")

        # Right column: verdict, probability and the JSON breakdown.
        with gr.Column(scale=2):
            gr.Markdown("### 🧾 Result")
            verdict_out = gr.Textbox(interactive=False, label="Overall verdict")
            prob_out = gr.Number(precision=4, label="Phishing probability (0–1)")
            details_out = gr.JSON(label="Details (model + heuristics)")

    gr.Examples(
        label="Try some example emails",
        examples=examples,
        inputs=[email_input],
    )

    # "Analyze" runs the classifier and fills the three result widgets.
    analyze_btn.click(
        fn=analyze_email,
        inputs=email_input,
        outputs=[verdict_out, prob_out, details_out],
    )

    # "Clear" empties the input and resets every result widget.
    clear_btn.click(
        fn=_reset_fields,
        inputs=None,
        outputs=[email_input, verdict_out, prob_out, details_out],
    )

if __name__ == "__main__":
    demo.launch()