Spaces:

princemaxp
/

CySecGuardians

Sleeping

App Files Files Community

princemaxp committed 18 days ago

Commit

b00d456

verified ·

1 Parent(s): 1ad7be9

Update header_analyzer.py

Browse files

Files changed (1) hide show

header_analyzer.py +112 -192

header_analyzer.py CHANGED Viewed

@@ -1,230 +1,150 @@
 import re
-import difflib
 import whois
-from datetime import datetime
# ---------------------------------------------
# Brand intelligence tables (extend as needed)
# ---------------------------------------------

# Brand keyword -> list of domains treated as official for that brand.
BRAND_OFFICIAL = dict(
    paypal=["paypal.com"],
    amazon=["amazon.com"],
    google=["google.com", "gmail.com"],
    microsoft=["microsoft.com", "outlook.com", "live.com"],
    apple=["apple.com"],
    flowtoscale=["flowtoscale.com"],
)

# Top-level domains (without the leading dot) treated as suspicious.
SUSPICIOUS_TLDS = set("xyz top click work loan tk info".split())

# Domains of free mailbox providers.
FREE_EMAIL_PROVIDERS = {
    "gmail.com",
    "yahoo.com",
    "outlook.com",
    "hotmail.com",
    "aol.com",
}
-# -----------------------------
 # Helpers
-# -----------------------------
def get_domain_age_days(domain: str, lookup=None):
    """Return the age of ``domain`` in whole days, or ``None`` if unknown.

    Args:
        domain: Domain name to look up.
        lookup: Optional callable that takes the domain and returns a record
            exposing a ``creation_date`` attribute. Defaults to
            ``whois.whois``.

    Returns:
        Number of whole days since the creation date, or ``None`` when the
        lookup fails or yields no usable date.
    """
    try:
        record = (lookup or whois.whois)(domain)
        created = record.creation_date
    except Exception:
        # Best-effort: any lookup failure means "age unknown", not a crash.
        return None

    # The creation date may arrive as a list; use the first entry.
    if isinstance(created, list):
        created = created[0] if created else None
    if isinstance(created, str):
        try:
            created = datetime.fromisoformat(created)
        except ValueError:
            return None
    if not isinstance(created, datetime):
        return None

    # Take "now" in the creation date's own timezone (or naive when it has
    # none). Subtracting an aware datetime from a naive one raises TypeError,
    # which previously ended up reported as "age unknown".
    return (datetime.now(created.tzinfo) - created).days
def extract_domain(header_value: str):
    """Return the lower-cased domain after the first ``@`` in a header value.

    Returns ``""`` when the value is empty or contains no ``@domain`` part.
    """
    found = re.search(r'@([a-zA-Z0-9.-]+)', header_value) if header_value else None
    if found is None:
        return ""
    return found.group(1).lower()
def extract_display_name(from_header: str):
    """Return the lower-cased display name that precedes ``<`` in a From header.

    Returns ``""`` when the header is empty or does not start with a
    (optionally quoted) name followed by ``<``.
    """
    if not from_header:
        return ""
    found = re.match(r'\"?([^"<]+)\"?\s*<', from_header)
    if not found:
        return ""
    name = found.group(1)
    return name.strip().lower()
-# -----------------------------
-# Authentication parsing (structured)
-# -----------------------------
def parse_authentication_results(auth_header: str):
    """Extract SPF, DKIM and DMARC verdicts from an Authentication-Results value.

    Args:
        auth_header: Raw header value; matching is case-insensitive.

    Returns:
        Dict with keys ``spf``, ``dkim``, ``dmarc`` and ``dmarc_policy``. Each
        value is the matched token, or ``"unknown"`` when nothing matched.
    """
    results = {
        "spf": "unknown",
        "dkim": "unknown",
        "dmarc": "unknown",
        "dmarc_policy": "unknown",
    }
    if not auth_header:
        return results

    auth = auth_header.lower()
    for key in ("spf", "dkim", "dmarc"):
        m = re.search(rf"{key}=(pass|fail|softfail|neutral|none|permerror|temperror)", auth)
        if m:
            results[key] = m.group(1)

    # \b keeps "p=" from matching inside "sp=", which would report the
    # subdomain policy instead of the domain policy.
    m = re.search(r"\bp=(none|quarantine|reject)", auth)
    if m:
        results["dmarc_policy"] = m.group(1)
    return results
-# -----------------------------
-# Display-name spoofing (BEC core)
-# -----------------------------
def detect_display_name_spoof(display_name: str, from_domain: str):
    """Flag a display name that mentions a known brand sent from a non-official domain.

    Args:
        display_name: Display name from the From header; brand keys from
            ``BRAND_OFFICIAL`` are matched as plain substrings.
        from_domain: Domain of the From address.

    Returns:
        A finding message when a brand appears in ``display_name`` and
        ``from_domain`` is neither one of that brand's official domains nor a
        subdomain of one; otherwise ``None``.
    """
    if not display_name or not from_domain:
        return None

    sender = from_domain.lower().rstrip(".")
    for brand, legit_domains in BRAND_OFFICIAL.items():
        if brand not in display_name:
            continue
        # Match on a label boundary: a bare suffix test would accept
        # "evilpaypal.com" as official for "paypal.com".
        is_official = any(
            sender == ld or sender.endswith("." + ld) for ld in legit_domains
        )
        if not is_official:
            return (
                f"Display-name spoofing detected: "
                f"'{display_name}' but sender domain is {from_domain}"
            )
    return None
-# -----------------------------
-# Main Analyzer
-# -----------------------------
def analyze_headers(headers, body=""):
    """Score an email's headers for phishing / BEC indicators.

    Args:
        headers: Mapping of header name to value. Names are matched
            case-insensitively; ``None`` is treated as no headers.
        body: Accepted for interface compatibility; not used here.

    Returns:
        ``(findings, score, auth)``: a list of finding strings, an integer
        score clamped to 0-100, and the dict returned by
        ``parse_authentication_results``. When nothing is flagged, findings
        holds a single "no issues" message and the score is 0.
    """
    # Header field names are case-insensitive, so index them lower-cased.
    # Otherwise "Message-Id" is reported missing and "reply-to" is ignored.
    # The first occurrence of a name wins.
    by_name = {}
    for name, value in (headers or {}).items():
        by_name.setdefault(str(name).lower(), value)

    def header(name):
        return by_name.get(name.lower()) or ""

    findings = []
    score = 0

    # --- Extract headers ---
    from_header = header("From")
    from_domain = extract_domain(from_header)
    reply_domain = extract_domain(header("Reply-To"))
    return_domain = extract_domain(header("Return-Path"))
    display_name = extract_display_name(from_header)

    # --- Authentication analysis ---
    auth = parse_authentication_results(header("Authentication-Results"))
    if auth["dmarc"] == "fail":
        findings.append("Authentication failure: DMARC failed")
        score += 40
    if auth["spf"] in {"fail", "softfail"}:
        findings.append("Authentication failure: SPF failed or soft-failed")
        score += 25
    if auth["dkim"] in {"fail", "permerror"}:
        findings.append("Authentication failure: DKIM failed")
        score += 25
    if auth["dmarc"] == "pass" and auth["dmarc_policy"] == "none":
        findings.append("Weak DMARC policy: p=none (monitoring only)")
        score += 10

    # --- Identity / BEC signals ---
    spoof = detect_display_name_spoof(display_name, from_domain)
    if spoof:
        findings.append(spoof)
        score += 45  # High confidence BEC signal
    if from_domain and reply_domain and from_domain != reply_domain:
        findings.append(
            f"Reply-To mismatch: From domain {from_domain}, Reply-To domain {reply_domain}"
        )
        score += 25
    if return_domain and from_domain and return_domain != from_domain:
        findings.append(
            f"Return-Path mismatch: From domain {from_domain}, Return-Path {return_domain}"
        )
        score += 20

    # --- Domain reputation and age ---
    if from_domain:
        if from_domain in FREE_EMAIL_PROVIDERS:
            findings.append(f"Free email provider used ({from_domain})")
            score += 10

        tld = from_domain.split(".")[-1]
        if tld in SUSPICIOUS_TLDS:
            findings.append(f"Suspicious or abused TLD detected ({tld})")
            score += 15

        age_days = get_domain_age_days(from_domain)
        if age_days is not None:
            if age_days < 14:
                findings.append(f"Domain is extremely new ({age_days} days old)")
                score += 40
            elif age_days < 30:
                findings.append(f"Domain is newly registered ({age_days} days old)")
                score += 25
            elif age_days < 90:
                findings.append(f"Domain is relatively new ({age_days} days old)")
                score += 10

        # Look-alike detection: similar to, but not the same as, an official domain.
        for legit_domains in BRAND_OFFICIAL.values():
            for legit in legit_domains:
                ratio = difflib.SequenceMatcher(None, from_domain, legit).ratio()
                if 0.75 <= ratio < 0.95 and from_domain != legit:
                    findings.append(
                        f"Possible look-alike domain spoofing ({from_domain} vs {legit})"
                    )
                    score += 30

    # --- Header anomalies (low weight) ---
    if header("Bcc"):
        findings.append("BCC header present (mass-mailing / phishing indicator)")
        score += 8
    if not header("Message-ID"):
        findings.append("Missing Message-ID header")
        score += 8

    # --- Finalize ---
    score = int(max(0, min(score, 100)))
    if not findings:
        return ["No suspicious issues found in headers."], 0, auth
    return findings, score, auth

 import re
+import email
 import whois
+from email.utils import parseaddr
+from datetime import datetime, timezone
+# ---------------------------
 # Helpers
+# ---------------------------
def extract_domain(addr: str):
    """Return the lower-cased domain part of an email address.

    Args:
        addr: A bare address such as ``user@example.com``.

    Returns:
        The text after the last ``@``, stripped and lower-cased, or ``None``
        when ``addr`` is empty, has no ``@``, or has nothing after it.
    """
    # Without an "@" there is no domain part; returning the whole string
    # would make callers treat arbitrary text as a sender domain.
    if not addr or "@" not in addr:
        return None
    domain = addr.rpartition("@")[2].strip().lower()
    return domain or None
def safe_lower(value):
    """Return ``value`` lower-cased when it is a ``str``; otherwise ``""``."""
    if not isinstance(value, str):
        return ""
    return value.lower()
def days_old(domain, *, lookup=None, now=None):
    """Return the age of ``domain`` in whole days, or ``None`` if unknown.

    Args:
        domain: Domain name to look up.
        lookup: Optional callable that takes the domain and returns a record
            exposing a ``creation_date`` attribute. Defaults to
            ``whois.whois``.
        now: Optional reference ``datetime``; defaults to the current UTC
            time. A naive value is taken to be UTC.

    Returns:
        Whole days between the creation date and ``now``, or ``None`` when
        the lookup fails or yields no usable date.
    """
    try:
        record = (lookup or whois.whois)(domain)
        created = record.creation_date
    except Exception:
        # Best-effort: any lookup failure means "age unknown", not a crash.
        return None

    # The creation date may arrive as a list; use the first entry.
    if isinstance(created, (list, tuple)):
        created = created[0] if created else None
    if isinstance(created, str):
        try:
            created = datetime.fromisoformat(created)
        except ValueError:
            return None
    if not isinstance(created, datetime):
        return None

    # Make both operands timezone-aware. Subtracting a naive creation date
    # from an aware "now" raises TypeError, which used to be swallowed and
    # silently disabled the age check.
    # NOTE(review): naive dates are assumed to be UTC; confirm for the WHOIS source.
    if created.tzinfo is None:
        created = created.replace(tzinfo=timezone.utc)
    if now is None:
        now = datetime.now(timezone.utc)
    elif now.tzinfo is None:
        now = now.replace(tzinfo=timezone.utc)
    return (now - created).days
def looks_like_domain_spoof(display_name, from_domain):
    """Report whether the display name advertises a domain other than the sender's.

    The previous check returned ``True`` whenever the display name contained
    the sender domain's own first label, which flagged ordinary mail such as
    ``"PayPal" <service@paypal.com>``. This version looks for domain-like
    tokens (including the domain of an embedded address) in the display name
    and flags the message only when one of them is unrelated to
    ``from_domain``.

    Args:
        display_name: Display-name part of the From header.
        from_domain: Domain of the actual From address.

    Returns:
        ``True`` when the display name contains a domain that is neither
        ``from_domain`` nor a parent or subdomain of it; otherwise ``False``
        (including when either argument is empty or not a string).
    """
    if not isinstance(display_name, str) or not isinstance(from_domain, str):
        return False
    name = display_name.lower()
    sender = from_domain.lower().strip().strip(".")
    if not name or not sender:
        return False

    # Heuristic: dotted labels ending in an alphabetic TLD-like label.
    for claimed in re.findall(r"(?:[a-z0-9-]+\.)+[a-z]{2,}", name):
        aligned = (
            claimed == sender
            or claimed.endswith("." + sender)
            or sender.endswith("." + claimed)
        )
        if not aligned:
            return True
    return False
+# ---------------------------
+# AUTH RESULTS PARSING
+# ---------------------------
def parse_auth_results(headers):
    """Extract SPF, DKIM and DMARC verdicts from the Authentication-Results header.

    Args:
        headers: Object with a ``get(name, default)`` method (for example a
            ``dict``). ``None`` is treated as having no headers.

    Returns:
        Dict with keys ``spf``, ``dkim`` and ``dmarc`` (the matched verdict,
        or ``"unknown"``) and ``policy`` (``"none"``, ``"quarantine"``,
        ``"reject"``, or ``None`` when no policy is present).
    """
    auth = {"spf": "unknown", "dkim": "unknown", "dmarc": "unknown", "policy": None}

    raw = headers.get("Authentication-Results", "") if headers is not None else ""
    raw = raw.lower() if isinstance(raw, str) else ""
    if not raw:
        return auth

    # temperror / permerror are valid verdicts; without them such results
    # are reported as "unknown".
    verdicts = "pass|fail|softfail|neutral|none|temperror|permerror"
    for key in ("spf", "dkim", "dmarc"):
        m = re.search(rf"\b{key}=({verdicts})", raw)
        if m:
            auth[key] = m.group(1)

    # Accept both "p=" and "policy=" spellings, e.g. "dmarc=pass (p=REJECT)"
    # and "dmarc=pass (policy=none)". [^;] keeps the match inside the dmarc
    # clause and spans folded lines; \b keeps "p=" from matching "sp=".
    pol = re.search(r"dmarc=[^;]*?\b(?:p|policy)=(none|quarantine|reject)", raw)
    if pol:
        auth["policy"] = pol.group(1)
    return auth
+# ---------------------------
+# MAIN ANALYZER
+# ---------------------------
def analyze_headers(headers: dict):
    """Score parsed email headers for spoofing / BEC indicators.

    Args:
        headers: Header mapping. Either a plain ``dict`` (its ``"Received"``
            entry may be one string or a list of strings) or an object that
            also provides ``get_all``, such as ``email.message.Message``.
            ``None`` is treated as no headers.

    Returns:
        ``(findings, score, auth_summary)``: a list of finding strings, an
        integer score capped at 100, and a dict summarising the SPF / DKIM /
        DMARC verdicts and the From, Reply-To and Return-Path domains.
    """
    if headers is None:
        headers = {}
    findings = []
    score = 0

    # --- Extract addresses ---
    display_name, from_addr = parseaddr(headers.get("From", "") or "")
    reply_to = parseaddr(headers.get("Reply-To", "") or "")[1]
    return_path = parseaddr(headers.get("Return-Path", "") or "")[1]

    from_domain = extract_domain(from_addr)
    reply_domain = extract_domain(reply_to)
    return_domain = extract_domain(return_path)

    # --- Authentication checks ---
    auth = parse_auth_results(headers)
    if auth["spf"] in ("fail", "softfail"):
        findings.append("SPF authentication failed")
        score += 15
    if auth["dkim"] == "fail":
        findings.append("DKIM authentication failed")
        score += 15
    if auth["dmarc"] == "fail":
        findings.append("DMARC authentication failed")
        score += 20
    if auth["policy"] in ("quarantine", "reject") and auth["dmarc"] != "pass":
        findings.append("DMARC policy enforcement triggered")
        score += 10

    # --- Display name spoofing ---
    if looks_like_domain_spoof(display_name, from_domain):
        findings.append("Display name spoofing detected")
        score += 20

    # --- BEC / Reply-To ---
    # Compare only when the From domain is known; otherwise any Reply-To or
    # Return-Path would be reported as a "mismatch" against nothing.
    if from_domain and reply_domain and reply_domain != from_domain:
        findings.append("Reply-To domain mismatch (possible BEC)")
        score += 25
    if from_domain and return_domain and return_domain != from_domain:
        findings.append("Return-Path domain mismatch")
        score += 15

    # --- Domain age ---
    if from_domain:
        age = days_old(from_domain)
        if age is not None and age < 90:
            findings.append(f"Sender domain is newly registered ({age} days old)")
            score += 20

    # --- Received chain analysis ---
    # A plain dict has no get_all(); calling it unconditionally raised
    # AttributeError for the dict input the signature advertises.
    get_all = getattr(headers, "get_all", None)
    if callable(get_all):
        received_headers = list(get_all("Received", []) or [])
    else:
        received = headers.get("Received")
        if not received:
            received_headers = []
        elif isinstance(received, str):
            received_headers = [received]
        else:
            received_headers = list(received)

    if len(received_headers) > 8:
        findings.append("Unusually long Received header chain")
        score += 10
    if received_headers:
        # NOTE(review): assumes the list is in header order (newest hop
        # first), so the last entry is the originating hop; confirm with callers.
        origin_hop = safe_lower(received_headers[-1])
        final_hop = safe_lower(received_headers[0])
        if "localhost" in origin_hop or "127.0.0.1" in origin_hop:
            findings.append("Suspicious mail origination (localhost)")
            score += 15
        if "unknown" in final_hop:
            findings.append("Unknown sending host detected")
            score += 10

    # --- Alignment summary (informational, no score change) ---
    if auth["spf"] == "pass" and auth["dkim"] == "pass" and auth["dmarc"] == "pass":
        findings.append("Email authentication alignment passed")

    # --- Clamp score ---
    score = min(score, 100)
    auth_summary = {
        "SPF": auth["spf"],
        "DKIM": auth["dkim"],
        "DMARC": auth["dmarc"],
        "DMARC Policy": auth["policy"] or "unknown",
        "From Domain": from_domain,
        "Reply-To Domain": reply_domain,
        "Return-Path Domain": return_domain,
    }
    return findings, score, auth_summary