Spaces:

princemaxp
/

CySecGuardians

Sleeping

App Files Files Community

princemaxp committed on Sep 24, 2025

Commit

089b374

verified ·

1 Parent(s): c3f2fbf

Update body_analyzer.py

Browse files

Files changed (1) hide show

body_analyzer.py +74 -189

body_analyzer.py CHANGED Viewed

@@ -1,60 +1,42 @@
-# body_analyzer.py
 import os
 import re
 import requests
-import base64
-import io
 from typing import List
 HF_API_KEY = os.getenv("HF_API_KEY")
 HF_HEADERS = {"Authorization": f"Bearer {HF_API_KEY}"} if HF_API_KEY else {}
 HF_TIMEOUT = 20  # seconds
-# ML model names
-PHISHING_MODEL = "cybersectony/phishing-email-detection-distilbert_v2.4.1"
 ZERO_SHOT_MODEL = "facebook/bart-large-mnli"  # for intent/behavior
-# Suspicious phrase patterns (lowercased when matching)
 SUSPICIOUS_PATTERNS = [
-    "verify your account",
-    "urgent action",
-    "click here",
-    "reset password",
-    "confirm your identity",
-    "bank account",
-    "invoice",
-    "payment required",
-    "unauthorized login",
-    "compromised",
-    "final reminder",
-    "account suspended",
-    "account deactivated",
-    "update your information",
-    "legal action",
-    "limited time offer",
-    "claim your prize",
-    "verify immediately",
-    "verify now",
-    "verify your credentials",
 ]
 # zero-shot candidate labels for message behavior
 BEHAVIOR_LABELS = [
-    "credential harvesting",
-    "invoice/payment fraud",
-    "marketing",
-    "benign",
-    "malware",
-    "account takeover",
 ]
 def _call_hf_text_model(model_name: str, text: str):
-    """Call HF Inference API for text. Return raw JSON or None on failure."""
     if not HF_API_KEY:
         return None
     try:
         payload = {"inputs": text}
-        # For zero-shot, caller will pass parameters in payload if needed
         res = requests.post(
             f"https://api-inference.huggingface.co/models/{model_name}",
             headers=HF_HEADERS,
@@ -66,6 +48,7 @@ def _call_hf_text_model(model_name: str, text: str):
         return None
 def _call_hf_zero_shot(text: str, candidate_labels: List[str]):
     if not HF_API_KEY:
         return None
     try:
@@ -80,107 +63,35 @@ def _call_hf_zero_shot(text: str, candidate_labels: List[str]):
     except Exception:
         return None
-def _call_hf_image_ocr(model_name: str, image_bytes: bytes):
-    """
-    Call HF image OCR model endpoint. Returns string or None.
-    Uses raw bytes upload: content-type application/octet-stream body.
-    """
-    if not HF_API_KEY:
-        return None
-    try:
-        headers = HF_HEADERS.copy()
-        headers["Content-Type"] = "application/octet-stream"
-        res = requests.post(
-            f"https://api-inference.huggingface.co/models/{model_name}",
-            headers=headers,
-            data=image_bytes,
-            timeout=HF_TIMEOUT,
-        )
-        # Many vision models return {"generated_text": "..."} or list; attempt to parse common shapes
-        data = res.json()
-        if isinstance(data, dict):
-            # TrOCR-style may return {"generated_text": "..."}
-            if "generated_text" in data:
-                return data["generated_text"]
-            # Some OCR endpoints may return list of dicts
-        if isinstance(data, list) and data and isinstance(data[0], dict):
-            # choose text-like fields if present
-            candidate = data[0].get("generated_text") or data[0].get("text") or data[0].get("caption")
-            return candidate
-        # fallback: try string concatenation if possible
-        if isinstance(data, str):
-            return data
-    except Exception:
-        pass
-    return None
-# local pytesseract fallback
-def _ocr_local_pytesseract(image_bytes):
-    try:
-        from PIL import Image
-        import pytesseract
-        import io
-        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
-        text = pytesseract.image_to_string(image)
-        return text
-    except Exception:
-        return None
-def _parse_hf_phishing_model_output(result):
-    """
-    Expected: model may return list of logits/probs. Try common shapes.
-    Returns: label:str, confidence:float (0..1), all_probs:dict
-    """
     if not result:
-        return None, 0.0, {}
-    # if list of dicts with label & score
     if isinstance(result, list) and len(result) > 0 and isinstance(result[0], dict):
-        r0 = result[0]
-        label = r0.get("label")
-        score = r0.get("score", 0.0)
-        return label, float(score or 0.0), {label: float(score or 0.0)}
-    # if dict with labels & scores
-    if isinstance(result, dict):
-        # sometimes returns {'labels': [...], 'scores': [...]}
-        labels = result.get("labels") or result.get("label") or []
-        scores = result.get("scores") or result.get("score") or []
-        if isinstance(labels, list) and isinstance(scores, list) and labels and scores:
-            all_probs = {lab: float(sc) for lab, sc in zip(labels, scores)}
-            # pick max
-            max_lab = max(all_probs.items(), key=lambda x: x[1])
-            return max_lab[0], float(max_lab[1]), all_probs
-    return None, 0.0, {}
 def analyze_body(subject: str, body: str, urls: list, images: list):
-    """
-    Inputs:
-      subject: email subject (str)
-      body: plaintext combined body (str)
-      urls: list of urls
-      images: list of image bytes
-    Returns:
-      findings (list[str]), score (int 0..100), highlighted_body (str), verdict (str)
-    """
     findings = []
     score = 0
-    highlighted_body = (body or "")  # will attempt to highlight suspicious text/URLs
-    # 1) Basic heuristics on subject + body
-    combined_lower = ((subject or "") + "\n" + (body or "")).lower()
     for pattern in SUSPICIOUS_PATTERNS:
-        if pattern in combined_lower:
             findings.append(f"Suspicious phrase detected: \"{pattern}\"")
-            # weight subject phrases more heavily
-            if pattern in (subject or "").lower():
-                score += 30
-            else:
-                score += 18
             try:
                 highlighted_body = re.sub(re.escape(pattern), f"<mark>{pattern}</mark>", highlighted_body, flags=re.IGNORECASE)
             except Exception:
                 pass
-    # 2) URL heuristics (always include)
     for u in urls or []:
         findings.append(f"Suspicious URL detected: {u}")
         score += 10
@@ -188,7 +99,6 @@ def analyze_body(subject: str, body: str, urls: list, images: list):
             highlighted_body = re.sub(re.escape(u), f"<mark>{u}</mark>", highlighted_body, flags=re.IGNORECASE)
         except Exception:
             pass
-        # suspicious domain structure bump
         domain_match = re.search(r"https?://([^/]+)/?", u)
         if domain_match:
             domain = domain_match.group(1)
@@ -196,76 +106,40 @@ def analyze_body(subject: str, body: str, urls: list, images: list):
                 findings.append(f"URL: suspicious-looking domain {domain}")
                 score += 10
-    # 3) OCR images
-    ocr_texts = []
-    if images:
-        for img_bytes in images:
-            text = None
-            # Prefer HF TrOCR-like endpoint if HF_API_KEY provided
-            if HF_API_KEY:
-                # try a well-known OCR-capable model; TrOCR base is a candidate
-                ocr_result = _call_hf_image_ocr("microsoft/trocr-base-stage1", img_bytes)
-                if ocr_result:
-                    text = ocr_result
-            if not text:
-                # fallback to local pytesseract
-                text = _ocr_local_pytesseract(img_bytes)
-            if text:
-                ocr_texts.append(text)
-                findings.append("OCR: extracted text from image.")
-                # add small heuristic score for OCR results
-                lower = text.lower()
-                for pat in SUSPICIOUS_PATTERNS:
-                    if pat in lower:
-                        findings.append(f"OCR: suspicious phrase in image -> \"{pat}\"")
-                        score += 20
-    # 4) ML phishing model (Hugging Face)
-    ml_label = None
-    ml_conf = 0.0
-    ml_all = {}
-    model_input = "\n".join([subject or "", body or "", "\n".join(urls or []), "\n".join(ocr_texts or [])]).strip()
-    if model_input and HF_API_KEY:
-        raw = _call_hf_text_model(PHISHING_MODEL, model_input)
-        label, conf, allp = _parse_hf_phishing_model_output(raw)
-        if label:
-            ml_label = label
-            ml_conf = conf
-            ml_all = allp
-            findings.append(f"HuggingFace phishing model → {label} (conf {conf:.2f})")
-            # confidence scaled to score (but cap)
-            score += int(conf * 100 * 0.9)  # slightly reduce to avoid double-counting
-    # 5) Zero-shot behavior intent model (when HF available)
-    behavior = None
     behavior_conf = 0.0
     if HF_API_KEY and model_input:
         zs = _call_hf_zero_shot(model_input, BEHAVIOR_LABELS)
-        try:
-            if isinstance(zs, dict) and "labels" in zs and "scores" in zs:
-                best_label = zs["labels"][0]
-                best_score = float(zs["scores"][0])
-                behavior = best_label
-                behavior_conf = best_score
-                findings.append(f"Behavior inference → {behavior} (conf {behavior_conf:.2f})")
-                # add modest boost for strong behavior confidence
-                if behavior_conf >= 0.7:
-                    score += int(behavior_conf * 30)
-        except Exception:
-            pass
-    # 6) Final heuristics fallbacks
-    # If ML already strongly flagged phishing, ensure high score
-    if ml_conf >= 0.8 and ("phishing" in (ml_label or "").lower()):
-        score = max(score, 80)
-    # clamp
-    try:
-        score = int(max(0, min(score, 100)))
-    except Exception:
-        score = 0
-    # Final verdict mapping (tunable)
     if score >= 70:
         verdict = "🚨 Malicious"
     elif 50 <= score < 70:
@@ -276,5 +150,16 @@ def analyze_body(subject: str, body: str, urls: list, images: list):
         verdict = "✅ Safe"
         findings.append("No strong phishing signals detected by models/heuristics.")
-    # Return findings, score, highlighted body (with possible <mark> tags), verdict
-    return findings, score, highlighted_body, verdict

+# body_analyzer_v2.py
 import os
 import re
 import requests
 from typing import List
 HF_API_KEY = os.getenv("HF_API_KEY")
 HF_HEADERS = {"Authorization": f"Bearer {HF_API_KEY}"} if HF_API_KEY else {}
 HF_TIMEOUT = 20  # seconds
+# Hugging Face model names
+PHISHING_MODELS = [
+    "cybersectony/phishing-email-detection-distilbert_v2.4.1",
+    "ealvaradob/bert-finetuned-phishing"
+]
 ZERO_SHOT_MODEL = "facebook/bart-large-mnli"  # for intent/behavior
+# Suspicious phrase patterns
 SUSPICIOUS_PATTERNS = [
+    "verify your account", "urgent action", "click here", "reset password",
+    "confirm your identity", "bank account", "invoice", "payment required",
+    "unauthorized login", "compromised", "final reminder", "account suspended",
+    "account deactivated", "update your information", "legal action",
+    "limited time offer", "claim your prize", "verify immediately",
+    "verify now", "verify your credentials",
 ]
 # zero-shot candidate labels for message behavior
 BEHAVIOR_LABELS = [
+    "credential harvesting", "invoice/payment fraud", "marketing",
+    "benign", "malware", "account takeover",
 ]
 def _call_hf_text_model(model_name: str, text: str):
+    """Call HF Inference API for text classification"""
     if not HF_API_KEY:
         return None
     try:
         payload = {"inputs": text}
         res = requests.post(
             f"https://api-inference.huggingface.co/models/{model_name}",
             headers=HF_HEADERS,
         return None
 def _call_hf_zero_shot(text: str, candidate_labels: List[str]):
+    """Zero-shot classification for email behavior/intent"""
     if not HF_API_KEY:
         return None
     try:
     except Exception:
         return None
+def _parse_hf_model_output(result):
+    """Extract label and confidence from HF output"""
     if not result:
+        return None, 0.0
     if isinstance(result, list) and len(result) > 0 and isinstance(result[0], dict):
+        label = result[0].get("label")
+        score = result[0].get("score", 0.0)
+        return label, float(score or 0.0)
+    if isinstance(result, dict) and "labels" in result and "scores" in result:
+        return result["labels"][0], float(result["scores"][0])
+    return None, 0.0
 def analyze_body(subject: str, body: str, urls: list, images: list):
     findings = []
     score = 0
+    highlighted_body = body or ""
+    combined_text = f"{subject}\n{body}".lower()
+    # 1) Basic heuristics: suspicious phrases
     for pattern in SUSPICIOUS_PATTERNS:
+        if pattern in combined_text:
             findings.append(f"Suspicious phrase detected: \"{pattern}\"")
+            score += 30 if pattern in (subject or "").lower() else 18
             try:
                 highlighted_body = re.sub(re.escape(pattern), f"<mark>{pattern}</mark>", highlighted_body, flags=re.IGNORECASE)
             except Exception:
                 pass
+    # 2) URL heuristics
     for u in urls or []:
         findings.append(f"Suspicious URL detected: {u}")
         score += 10
             highlighted_body = re.sub(re.escape(u), f"<mark>{u}</mark>", highlighted_body, flags=re.IGNORECASE)
         except Exception:
             pass
         domain_match = re.search(r"https?://([^/]+)/?", u)
         if domain_match:
             domain = domain_match.group(1)
                 findings.append(f"URL: suspicious-looking domain {domain}")
                 score += 10
+    # 3) ML Phishing detection using multiple HF models
+    ml_labels = []
+    ml_confidences = []
+    model_input = "\n".join([subject or "", body or ""] + (urls or []))
+    for phish_model in PHISHING_MODELS:
+        if HF_API_KEY and model_input:
+            result = _call_hf_text_model(phish_model, model_input)
+            label, conf = _parse_hf_model_output(result)
+            if label:
+                findings.append(f"HF phishing model ({phish_model}) → {label} (conf {conf:.2f})")
+                ml_labels.append(label)
+                ml_confidences.append(conf)
+    # Take the max confidence phishing prediction
+    if ml_confidences:
+        max_idx = ml_confidences.index(max(ml_confidences))
+        if "phish" in (ml_labels[max_idx] or "").lower():
+            score += int(ml_confidences[max_idx] * 100 * 0.9)
+    # 4) Zero-shot intent/behavior classification
+    behavior_label = None
     behavior_conf = 0.0
     if HF_API_KEY and model_input:
         zs = _call_hf_zero_shot(model_input, BEHAVIOR_LABELS)
+        if isinstance(zs, dict) and "labels" in zs and "scores" in zs:
+            behavior_label = zs["labels"][0]
+            behavior_conf = float(zs["scores"][0])
+            findings.append(f"Behavior inference → {behavior_label} (conf {behavior_conf:.2f})")
+            if behavior_conf >= 0.7:
+                score += int(behavior_conf * 30)
+    # 5) Final score clamping
+    score = max(0, min(score, 100))
+    # 6) Verdict
     if score >= 70:
         verdict = "🚨 Malicious"
     elif 50 <= score < 70:
         verdict = "✅ Safe"
         findings.append("No strong phishing signals detected by models/heuristics.")
+    # 7) Richer textual summary (like your example)
+    summary = f"""
+Email analysis summary:
+- Subject: {subject}
+- Body length: {len(body)} chars
+- Detected behavior/intent: {behavior_label} (conf {behavior_conf:.2f})
+- Top phishing alert: {ml_labels[max_idx] if ml_labels else 'None'}
+- Suspicious phrases found: {len([f for f in findings if 'Suspicious phrase' in f])}
+- Total score: {score}/100
+Verdict: {verdict}
+"""
+    return findings, score, highlighted_body, verdict, summary