Spaces:

princemaxp
/

CySecGuardians

Running

App Files Community

princemaxp committed on Sep 24, 2025

Commit

e99affa

verified ·

1 Parent(s): e9f9e54

Update header_analyzer.py

Browse files

Files changed (1) hide show

header_analyzer.py +156 -158

header_analyzer.py CHANGED Viewed

@@ -1,167 +1,165 @@
 import re
-import difflib
-import whois
-from datetime import datetime
-# Official brand domains (extend as needed)
-BRAND_OFFICIAL = {
-    "paypal": ["paypal.com"],
-    "amazon": ["amazon.com"],
-    "google": ["google.com", "gmail.com"],
-    "microsoft": ["microsoft.com", "outlook.com", "live.com"],
-    "apple": ["apple.com"],
-    "flowtoscale": ["flowtoscale.com"],  # Example from your case
-}
-# Suspicious / cheap TLDs often abused
-SUSPICIOUS_TLDS = {"info", "xyz", "top", "click", "work", "loan", "tk"}
-def get_domain_age_days(domain: str):
-    """Return domain age in days (or None if lookup fails)."""
     try:
-        w = whois.whois(domain)
-        creation_date = w.creation_date
-        if isinstance(creation_date, list):  # sometimes returns list
-            creation_date = creation_date[0]
-        if creation_date:
-            return (datetime.now() - creation_date).days
     except Exception:
         return None
-    return None
-def parse_auth_results(auth_header: str):
-    """
-    Parse the Authentication-Results header and return a readable summary.
-    """
-    auth_header = (auth_header or "").lower()
-    findings = []
-    if not auth_header:
-        return "No Authentication-Results header found"
-    # SPF
-    if "spf=pass" in auth_header:
-        findings.append("SPF passed")
-    elif "spf=fail" in auth_header:
-        findings.append("SPF failed")
-    # DKIM
-    if "dkim=pass" in auth_header:
-        findings.append("DKIM passed")
-    elif "dkim=fail" in auth_header or "dkim=permerror" in auth_header:
-        findings.append("DKIM failed")
-    # DMARC
-    if "dmarc=pass" in auth_header:
-        findings.append("DMARC passed")
-    elif "dmarc=fail" in auth_header:
-        findings.append("DMARC failed")
-    if not findings:
-        return "Authentication results unclear or missing"
-    return ", ".join(findings)
-def analyze_headers(headers, body=""):
-    """
-    Input: headers dict, optional body text
-    Output: (findings: list[str], score: int, auth_summary: str)
-    """
     findings = []
     score = 0
-    headers = headers or {}
-    auth_results = (headers.get("Authentication-Results") or headers.get("Authentication-results") or "").lower()
-    # Strict auth failures
-    if "dkim=fail" in auth_results or "dkim=permerror" in auth_results:
-        findings.append("Header: DKIM check failed")
-        score += 30
-    if "spf=fail" in auth_results:
-        findings.append("Header: SPF check failed")
-        score += 30
-    if "dmarc=fail" in auth_results:
-        findings.append("Header: DMARC check failed")
-        score += 30
-    # Softer auth problems
-    if any(x in auth_results for x in ["spf=softfail", "spf=neutral", "spf=none"]):
-        findings.append("Header: SPF not properly aligned")
         score += 10
-    if any(x in auth_results for x in ["dmarc=temperror", "dkim=temperror"]):
-        findings.append("Header: Temporary auth errors (DKIM/DMARC)")
-        score += 5
-    # From and Reply-To domain compare
-    from_addr = headers.get("From", "") or ""
-    reply_to = headers.get("Reply-To", "") or ""
-    from_domain_m = re.search(r'@([a-zA-Z0-9.-]+)', from_addr)
-    reply_domain_m = re.search(r'@([a-zA-Z0-9.-]+)', reply_to)
-    if from_domain_m and reply_domain_m:
-        from_domain = from_domain_m.group(1).lower()
-        reply_domain = reply_domain_m.group(1).lower()
-        if from_domain != reply_domain:
-            findings.append(f"Header: Reply-To domain mismatch (From: {from_domain}, Reply-To: {reply_domain})")
-            score += 20
     else:
-        from_domain = from_domain_m.group(1).lower() if from_domain_m else ""
-    # Sender domain analysis
-    if from_domain:
-        parts = from_domain.split('.')
-        tld = parts[-1]
-        # free provider detection
-        if from_domain in ["gmail.com", "yahoo.com", "outlook.com", "hotmail.com"]:
-            findings.append(f"Header: Free email provider used ({from_domain})")
-            score += 8
-        # suspicious domain structure
-        if len(parts) > 4 or (parts and any(ch.isdigit() for ch in parts[0])):
-            findings.append(f"Header: Suspicious-looking domain structure ({from_domain})")
-            score += 15
-        # suspicious TLD
-        if tld in SUSPICIOUS_TLDS:
-            findings.append(f"Header: Suspicious/abused TLD used ({tld})")
-            score += 20
-        # Domain age check
-        age_days = get_domain_age_days(from_domain)
-        if age_days is not None and age_days < 90:
-            findings.append(f"Header: Domain {from_domain} is very new ({age_days} days old)")
-            score += 35
-        # brand-squatting / look-alike check
-        for brand, official_list in BRAND_OFFICIAL.items():
-            if brand in from_domain:
-                is_official = any(
-                    from_domain.endswith("." + off) or from_domain == off
-                    for off in official_list
-                )
-                if not is_official:
-                    findings.append(f"Header: Domain contains brand '{brand}' but is not official ({from_domain})")
-                    score += 30
-            # fuzzy look-alike
-            for legit in official_list:
-                ratio = difflib.SequenceMatcher(None, from_domain, legit).ratio()
-                if ratio > 0.7 and from_domain != legit:
-                    findings.append(f"Header: Possible look-alike spoofing ({from_domain} vs {legit})")
-                    score += 40
-        # Content-to-domain mismatch (organization spoofing)
-        if body and "ravenmail" in body.lower() and "ravenmail" not in from_domain:
-            findings.append("Header/Content: Possible spoofing — mentions RavenMail but sender domain is unrelated")
-            score += 40
-    # Bcc usage
-    if headers.get("Bcc") or headers.get("bcc"):
-        findings.append("Header: Email sent with BCC (common in mass phishing)")
-        score += 12
-    if not findings:
-        return ["No suspicious issues found in headers."], 0, "No Authentication-Results header found"
-    # Return findings, cumulative score, and parsed authentication summary
-    return findings, score, parse_auth_results(auth_results)

+# body_analyzer.py
+import os
 import re
+import requests
+from typing import List
+HF_API_KEY = os.getenv("HF_API_KEY")
+HF_HEADERS = {"Authorization": f"Bearer {HF_API_KEY}"} if HF_API_KEY else {}
+HF_TIMEOUT = 20  # seconds
+# ML model names
+PHISHING_MODEL = "cybersectony/phishing-email-detection-distilbert_v2.4.1"
+ZERO_SHOT_MODEL = "facebook/bart-large-mnli"  # for intent/behavior
+# Suspicious phrase patterns
+SUSPICIOUS_PATTERNS = [
+    "verify your account",
+    "urgent action",
+    "click here",
+    "reset password",
+    "confirm your identity",
+    "bank account",
+    "invoice",
+    "payment required",
+    "unauthorized login",
+    "compromised",
+    "final reminder",
+    "account suspended",
+    "account deactivated",
+    "update your information",
+    "legal action",
+    "limited time offer",
+    "claim your prize",
+    "verify immediately",
+    "verify now",
+    "verify your credentials",
+]
+# Zero-shot candidate labels for intent/behavior
+BEHAVIOR_LABELS = [
+    "credential harvesting",
+    "invoice/payment fraud",
+    "marketing",
+    "benign",
+    "malware",
+    "account takeover",
+]
+def _call_hf_text_model(model_name: str, text: str):
+    if not HF_API_KEY:
+        return None
     try:
+        payload = {"inputs": text}
+        res = requests.post(
+            f"https://api-inference.huggingface.co/models/{model_name}",
+            headers=HF_HEADERS,
+            json=payload,
+            timeout=HF_TIMEOUT,
+        )
+        return res.json()
     except Exception:
         return None
+def _call_hf_zero_shot(text: str, candidate_labels: List[str]):
+    if not HF_API_KEY:
+        return None
+    try:
+        payload = {"inputs": text, "parameters": {"candidate_labels": candidate_labels}}
+        res = requests.post(
+            f"https://api-inference.huggingface.co/models/{ZERO_SHOT_MODEL}",
+            headers=HF_HEADERS,
+            json=payload,
+            timeout=HF_TIMEOUT,
+        )
+        return res.json()
+    except Exception:
+        return None
+def _parse_hf_phishing_model_output(result):
+    if not result:
+        return None, 0.0, {}
+    if isinstance(result, list) and result and isinstance(result[0], dict):
+        r0 = result[0]
+        label = r0.get("label")
+        score = r0.get("score", 0.0)
+        return label, float(score), {label: float(score)}
+    if isinstance(result, dict):
+        labels = result.get("labels") or result.get("label") or []
+        scores = result.get("scores") or result.get("score") or []
+        if isinstance(labels, list) and isinstance(scores, list) and labels and scores:
+            all_probs = {lab: float(sc) for lab, sc in zip(labels, scores)}
+            max_lab = max(all_probs.items(), key=lambda x: x[1])
+            return max_lab[0], float(max_lab[1]), all_probs
+    return None, 0.0, {}
+def analyze_body(subject: str, body: str, urls: list, images: list):
     findings = []
     score = 0
+    highlighted_body = (body or "")
+    combined_lower = ((subject or "") + "\n" + (body or "")).lower()
+    for pattern in SUSPICIOUS_PATTERNS:
+        if pattern in combined_lower:
+            findings.append(f"Suspicious phrase detected: \"{pattern}\"")
+            score += 18
+            try:
+                highlighted_body = re.sub(re.escape(pattern), f"<mark>{pattern}</mark>", highlighted_body, flags=re.IGNORECASE)
+            except Exception:
+                pass
+    # URL checks
+    for u in urls or []:
+        findings.append(f"Suspicious URL detected: {u}")
         score += 10
+        try:
+            highlighted_body = re.sub(re.escape(u), f"<mark>{u}</mark>", highlighted_body, flags=re.IGNORECASE)
+        except Exception:
+            pass
+    # ML phishing model
+    ml_label = None
+    ml_conf = 0.0
+    model_input = "\n".join([subject or "", body or "", "\n".join(urls or [])]).strip()
+    if model_input and HF_API_KEY:
+        raw = _call_hf_text_model(PHISHING_MODEL, model_input)
+        label, conf, _ = _parse_hf_phishing_model_output(raw)
+        if label:
+            ml_label = label
+            ml_conf = conf
+            findings.append(f"HuggingFace phishing model → {label} (conf {conf:.2f})")
+            score += int(conf * 100 * 0.9)
+    # Zero-shot behavior
+    behavior = None
+    behavior_conf = 0.0
+    if HF_API_KEY and model_input:
+        zs = _call_hf_zero_shot(model_input, BEHAVIOR_LABELS)
+        try:
+            if isinstance(zs, dict) and "labels" in zs and "scores" in zs:
+                behavior = zs["labels"][0]
+                behavior_conf = float(zs["scores"][0])
+                findings.append(f"Behavior inference → {behavior} (conf {behavior_conf:.2f})")
+                if behavior_conf >= 0.7:
+                    score += int(behavior_conf * 30)
+        except Exception:
+            pass
+    if ml_conf >= 0.8 and ("phishing" in (ml_label or "").lower()):
+        score = max(score, 80)
+    score = int(max(0, min(score, 100)))
+    # Verdict
+    if score >= 70:
+        verdict = "🚨 Malicious"
+    elif 50 <= score < 70:
+        verdict = "⚠️ Suspicious"
+    elif 30 <= score < 50:
+        verdict = "📩 Spam"
     else:
+        verdict = "✅ Safe"
+        findings.append("No strong phishing signals detected by models/heuristics.")
+    # Return exactly 4 values
+    return findings, score, highlighted_body, verdict