Spaces:
Sleeping
Sleeping
| """ | |
| PhishLens HTML Structural Anomaly Feature Module. | |
| Extracts 11 features from the HTML body of emails by parsing with BeautifulSoup. | |
| Phishing emails rely heavily on HTML tricks to hide malicious content, redirect | |
| users, and harvest credentials. | |
| Security rationale: HTML-based obfuscation is a primary evasion technique. | |
| Hidden text (display:none), form POST to attacker-controlled domains, and | |
| href/visible-text mismatches are reliable signals that cannot be faked without | |
| triggering feature flags. These features complement NLP features which only | |
| see the rendered/visible text. | |
| """ | |
| from __future__ import annotations | |
| import re | |
| from typing import Dict, List, Optional | |
| from urllib.parse import urlparse | |
| from bs4 import BeautifulSoup | |
| import tldextract | |
| from src.utils.logger import get_logger | |
# Cap the amount of HTML handed to BeautifulSoup so parse time stays bounded
# on very large emails (multi-MB newsletters, base64-inlined images, etc.).
# The cap is applied with len() and slicing on the decoded string, so it
# counts characters, not bytes.
# NOTE(review): the original rationale is that structural signals (links,
# forms, hidden elements) appear early in the document, so truncating here
# does not materially affect feature quality — confirm against real mail.
_MAX_HTML_CHARS = 200_000  # 200,000 characters (~200 KB of ASCII)

# Module-level logger obtained from the project's logging helper.
log = get_logger(__name__)
# Inline-CSS patterns treated as "hidden text" indicators. Each pattern is
# searched against an element's ``style`` attribute; a single match is enough
# to flag the element.
_HIDDEN_TEXT_PATTERNS = [
    re.compile(r"display\s*:\s*none", re.IGNORECASE),
    re.compile(r"font-size\s*:\s*0", re.IGNORECASE),
    # White foreground text. The lookbehind anchors the property name so that
    # `background-color: #ffffff` or `border-color: white` — present in almost
    # every legitimate HTML email — are not mistaken for white *text*.
    re.compile(
        r"(?<![\w-])color\s*:\s*(white|#fff|#ffffff|rgba\(255,255,255)",
        re.IGNORECASE,
    ),
    re.compile(r"visibility\s*:\s*hidden", re.IGNORECASE),
    # `opacity: 0` but not fractional values such as `opacity: 0.8`.
    re.compile(r"opacity\s*:\s*0(?!\.\d)", re.IGNORECASE),
    # Modern phishing CSS obfuscation techniques:
    re.compile(r"height\s*:\s*0px|height\s*:\s*0;", re.IGNORECASE),
    re.compile(r"max-height\s*:\s*0", re.IGNORECASE),
    re.compile(r"overflow\s*:\s*hidden", re.IGNORECASE),
    # Large negative indent (three or more digits) pushes text off-screen.
    re.compile(r"text-indent\s*:\s*-\d{3,}", re.IGNORECASE),
    re.compile(r"clip\s*:\s*rect\s*\(\s*0", re.IGNORECASE),
    re.compile(r"mso-hide\s*:\s*all", re.IGNORECASE),  # Outlook-specific hiding
]

# Base64 data URI pattern, e.g. ``data:image/png;base64,``.
_BASE64_DATA_RE = re.compile(r"data:[^;]+;base64,", re.IGNORECASE)

# Tracking pixel pattern: a width/height attribute or CSS property equal to 1.
# NOTE(review): no function visible in this file references this pattern.
_TRACKING_PIXEL_RE = re.compile(r'(width|height)\s*[=:]\s*["\']?1["\']?', re.IGNORECASE)
def extract_html_features(html_body: str) -> Dict:
    """Build the 11-feature HTML structural-anomaly dict for one email body.

    The body is capped at ``_MAX_HTML_CHARS`` characters and parsed with
    BeautifulSoup (``lxml`` first, ``html.parser`` as a fallback). Every
    feature is then computed inside its own ``try`` so that a failure in one
    extractor leaves only that feature at its default value.

    Args:
        html_body: Raw HTML string from the email body.

    Returns:
        Dict with 11 numeric HTML features. All values are the zero defaults
        when ``html_body`` is empty/whitespace-only or cannot be parsed.
    """
    if not (html_body and html_body.strip()):
        return _default_html_features()

    # Slicing is a no-op for bodies already within the limit.
    html_body = html_body[:_MAX_HTML_CHARS]
    features = _default_html_features()

    try:
        soup = BeautifulSoup(html_body, "lxml")
    except Exception as exc:
        log.debug(f"BeautifulSoup parse error: {exc}")
        try:
            soup = BeautifulSoup(html_body, "html.parser")
        except Exception:
            return features

    # (feature name, extractor) pairs, evaluated in this order. Each runs in
    # isolation; on error the feature keeps its default and the error is logged.
    single_value_extractors = (
        ("href_text_mismatch_count", _count_href_text_mismatches),
        ("external_form_action", lambda doc: int(_has_external_form_action(doc))),
        ("hidden_text_count", _count_hidden_text_elements),
        ("image_to_text_ratio", _compute_image_to_text_ratio),
        ("tracking_pixel_count", _count_tracking_pixels),
        ("base64_content_count", _count_base64_content),
        ("javascript_count", lambda doc: len(doc.find_all("script"))),
        ("external_css_count", _count_external_css),
    )
    for feature_name, extractor in single_value_extractors:
        try:
            features[feature_name] = extractor(soup)
        except Exception as exc:
            log.debug(f"{feature_name} error: {exc}")

    # The three link features share one pass over the anchors.
    try:
        anchors = soup.find_all("a", href=True)
        features["total_links"] = len(anchors)
        resolved = (_extract_link_domain(anchor["href"]) for anchor in anchors)
        distinct_domains = {domain for domain in resolved if domain}
        features["unique_domains_in_links"] = len(distinct_domains)
        if features["total_links"] > 0:
            features["link_domain_diversity"] = (
                len(distinct_domains) / features["total_links"]
            )
    except Exception as exc:
        log.debug(f"link domain features error: {exc}")

    return features
| # --------------------------------------------------------------------------- | |
| # Feature implementations | |
| # --------------------------------------------------------------------------- | |
def _count_href_text_mismatches(soup: BeautifulSoup) -> int:
    """Count anchors whose visible text names a different host than their href.

    Only absolute ``http(s)://`` links with non-empty visible text are
    examined. Two situations are counted:

    1. The href targets a raw IPv4 address while the visible text looks like
       a domain name (the IP-in-href trick).
    2. The visible text contains a domain-like token that differs from the
       href's host name. A leading ``www.`` is ignored on both sides, and the
       href's port / userinfo are not part of the comparison.

    Args:
        soup: Parsed email HTML.

    Returns:
        Number of anchors flagged as mismatched.
    """

    def _strip_www(host: str) -> str:
        # str.lstrip("www.") removes any leading run of 'w' and '.' characters
        # (it would turn "wwpaypal.com" into "paypal.com"), so drop only the
        # literal prefix.
        return host[4:] if host.startswith("www.") else host

    count = 0
    for a in soup.find_all("a", href=True):
        href = str(a["href"]).strip()
        visible_text = a.get_text(strip=True)
        if not href or not visible_text:
            continue
        # URL schemes are case-insensitive, so "HTTP://..." must be examined too.
        if not re.match(r"https?://", href, re.IGNORECASE):
            continue

        # Case 1: href is a raw IP but visible text looks like a domain.
        if re.match(r"https?://(?:\d{1,3}\.){3}\d{1,3}", href, re.IGNORECASE):
            if re.search(r"[a-zA-Z]{3,}\.[a-zA-Z]{2,}", visible_text):
                count += 1
            continue

        # Case 2: domain in href differs from the domain in the visible text.
        # Cheap pre-filter: skip text with nothing domain-like in it.
        if not re.search(r"[a-zA-Z0-9][.-][a-zA-Z]{2,}", visible_text):
            continue
        # Capture every dot-separated label so "www.paypal.com" is read as a
        # whole rather than being cut off after the second label.
        text_match = re.search(r"(?:[\w-]+\.)+[a-zA-Z]{2,}", visible_text)
        if not text_match:
            continue
        try:
            # hostname is lower-cased and excludes any port or user:pass@ part.
            host = urlparse(href).hostname or ""
        except ValueError:
            # Malformed URL (e.g. a broken IPv6 literal): nothing to compare.
            continue

        href_domain = _strip_www(host)
        text_domain = _strip_www(text_match.group(0).lower())
        if href_domain and text_domain and href_domain != text_domain:
            count += 1
    return count
def _has_external_form_action(soup: BeautifulSoup) -> bool:
    """Return True if any ``<form>`` submits to an absolute URL.

    Security rationale: Credential-harvesting forms in phishing emails
    POST login data to attacker-controlled servers. Any form action
    pointing to an external URL is a strong indicator.

    No comparison against the sender's domain is made: a form counts as
    external when its ``action`` is an ``http(s)://`` URL or a
    protocol-relative ``//host/...`` URL. Surrounding whitespace and the
    letter case of the scheme are ignored so that neither can be used to
    slip past the check.

    Args:
        soup: Parsed email HTML.

    Returns:
        True when at least one form has such an action, otherwise False.
    """
    for form in soup.find_all("form"):
        action = str(form.get("action") or "").strip()
        # Optional scheme followed by "//" covers both absolute and
        # protocol-relative targets; relative paths never match.
        if re.match(r"(?:https?:)?//", action, re.IGNORECASE):
            return True
    return False
def _count_hidden_text_elements(soup: BeautifulSoup) -> int:
    """Count HTML elements that visually hide text.

    Security rationale: Hidden white-on-white text, zero-font-size content,
    and display:none elements are used to stuff keywords that evade spam
    filters while remaining invisible to human readers.

    An element is counted when its inline ``style`` matches any entry of
    ``_HIDDEN_TEXT_PATTERNS`` or when it carries the HTML ``hidden``
    attribute. Each element is counted at most once, even if it is hidden
    by both mechanisms.

    Args:
        soup: Parsed email HTML.

    Returns:
        Number of distinct hidden elements.
    """

    def _style_hides(style: object) -> bool:
        css = str(style or "")
        return any(pattern.search(css) for pattern in _HIDDEN_TEXT_PATTERNS)

    count = sum(
        1
        for element in soup.find_all(style=True)
        if _style_hides(element.get("style", ""))
    )
    # Elements with the `hidden` attribute; those whose style already
    # matched were counted above and must not be counted a second time.
    count += sum(
        1
        for element in soup.find_all(hidden=True)
        if not _style_hides(element.get("style", ""))
    )
    return count
def _compute_image_to_text_ratio(soup: BeautifulSoup) -> float:
    """Return the number of ``<img>`` tags divided by the word count.

    Security rationale: Pure-image phishing emails contain no analysable text
    by design — the phishing content is baked into images to evade text-based
    filters. A high image-to-text ratio is a strong phishing signal.

    Words are the whitespace-separated tokens of ``soup.get_text()``. When
    the document has no words at all, the image count itself is returned
    (as a float) instead of dividing by zero.
    """
    images = soup.find_all("img")
    words = soup.get_text().split()
    if words:
        return len(images) / len(words)
    # All images, no text.
    return float(len(images))
def _count_tracking_pixels(soup: BeautifulSoup) -> int:
    """Count ``<img>`` tags that look like tracking pixels.

    Security rationale: Tracking pixels confirm delivery to a live email
    address. Phishers use them to validate target lists and time follow-up attacks.

    An image qualifies when both its ``width`` and ``height`` attributes are
    exactly ``1``, or when its ``src`` contains the word "tracking"
    (case-insensitive).
    """

    def _looks_like_pixel(img) -> bool:
        is_one_by_one = (
            str(img.get("width", "")) == "1" and str(img.get("height", "")) == "1"
        )
        # The src is only inspected when the size test did not already match.
        return is_one_by_one or "tracking" in img.get("src", "").lower()

    return sum(1 for img in soup.find_all("img") if _looks_like_pixel(img))
def _count_base64_content(soup: BeautifulSoup) -> int:
    """Count base64 ``data:`` URIs in the serialized document.

    Security rationale: Base64-encoded content embedded directly in HTML
    bypasses URL-based phishing filters entirely. Legitimate email rarely
    uses inline base64 for anything other than small icons.

    The whole tree is rendered back to a string and every non-overlapping
    match of ``_BASE64_DATA_RE`` is counted, wherever it occurs.
    """
    serialized = str(soup)
    return sum(1 for _ in _BASE64_DATA_RE.finditer(serialized))
def _count_external_css(soup: BeautifulSoup) -> int:
    """Count ``<link>`` stylesheets that are loaded from an http(s) URL.

    Security rationale: External CSS can be used to dynamically alter the
    appearance of email after delivery (e.g., hiding/showing content based
    on when it is opened — a sign of delayed activation phishing).

    A link is counted when its ``rel`` value mentions "stylesheet"
    (case-insensitive) and its ``href`` starts with ``http``.
    """

    def _is_remote_stylesheet(tag) -> bool:
        # rel may be a list of tokens; its string form is searched as a whole.
        rel_text = str(tag.get("rel", [])).lower()
        if "stylesheet" not in rel_text:
            return False
        return tag.get("href", "").startswith("http")

    return sum(
        1 for tag in soup.find_all("link", rel=True) if _is_remote_stylesheet(tag)
    )
def _extract_link_domain(href: str) -> Optional[str]:
    """Extract the registered domain from an href value.

    Args:
        href: Raw ``href`` attribute value of a link.

    Returns:
        The registered domain as reported by ``tldextract`` (for example
        ``example.co.uk``), or ``None`` when the href is not an http(s)
        link, has no registrable domain, or cannot be processed.
    """
    try:
        candidate = href.strip()
        # Schemes are case-insensitive and attribute values may carry stray
        # whitespace; neither should cause a link to be ignored.
        if not candidate.lower().startswith("http"):
            return None
        ext = tldextract.extract(candidate)
        # Newer tldextract releases expose `top_domain_under_public_suffix`;
        # older ones only provide `registered_domain`. Without the fallback
        # the AttributeError would be swallowed below and every link would
        # silently resolve to None.
        domain = getattr(ext, "top_domain_under_public_suffix", None)
        if domain is None:
            domain = getattr(ext, "registered_domain", None)
        return domain or None
    except Exception:
        # Deliberately best-effort: one unparseable href must not prevent
        # the remaining links from being processed by the caller.
        return None
def _has_meta_refresh(soup: BeautifulSoup) -> bool:
    """Report whether the document contains a redirecting meta refresh tag.

    Meta refresh is used by phishers to redirect victims to a malicious
    page after a short delay, often with a blank/loading placeholder page
    shown first to evade automated scanners.

    A ``<meta>`` tag qualifies when its ``http-equiv`` equals "refresh" and
    its ``content`` contains "url=", both compared case-insensitively.
    """

    def _is_redirect(meta) -> bool:
        # Both attributes are read and lower-cased before either is tested.
        equiv, content = [
            meta.get(attr, "").lower() for attr in ("http-equiv", "content")
        ]
        return equiv == "refresh" and "url=" in content

    return any(map(_is_redirect, soup.find_all("meta")))
def _default_html_features() -> Dict:
    """Return a fresh dict holding the zero default for every HTML feature.

    Count and flag features default to the integer ``0``; the two ratio
    features default to the float ``0.0``. A new dict is built on each call,
    so callers may mutate the result freely.
    """
    return dict(
        href_text_mismatch_count=0,
        external_form_action=0,
        hidden_text_count=0,
        image_to_text_ratio=0.0,
        tracking_pixel_count=0,
        base64_content_count=0,
        javascript_count=0,
        external_css_count=0,
        total_links=0,
        unique_domains_in_links=0,
        link_domain_diversity=0.0,
    )