Spaces:

SagarTony90265
/

PhishSentinel

Sleeping

PhishSentinel / src /features /html_features.py

github-actions[bot]

Deploy to HF Spaces (ci)

0fd143d 14 days ago

11.3 kB

	"""
	PhishLens HTML Structural Anomaly Feature Module.

	Extracts 11 features from the HTML body of emails by parsing with BeautifulSoup.
	Phishing emails rely heavily on HTML tricks to hide malicious content, redirect
	users, and harvest credentials.

	Security rationale: HTML-based obfuscation is a primary evasion technique.
	Hidden text (display:none), form POST to attacker-controlled domains, and
	href/visible-text mismatches are reliable signals that cannot be faked without
	triggering feature flags. These features complement NLP features which only
	see the rendered/visible text.
	"""

	from __future__ import annotations

	import re
	from typing import Dict, List, Optional
	from urllib.parse import urlparse

	from bs4 import BeautifulSoup
	import tldextract

	from src.utils.logger import get_logger

	# Upper bound on how much of the HTML body is handed to BeautifulSoup.
	# Very large emails (multi-MB newsletters, base64-inlined images, etc.) can
	# make parsing slow, so extract_html_features() cuts the body to this many
	# characters before parsing. The limit counts characters (len() of the str),
	# not bytes; it is roughly 200 KB for ASCII-only markup. The cut may land in
	# the middle of a tag, and any links or forms located after the cut-off are
	# not seen by the feature extractors.
	_MAX_HTML_CHARS = 200_000 # characters, not bytes

	# Module-level logger obtained from the project's logging helper.
	log = get_logger(__name__)

	# CSS declarations commonly used to hide text from the reader while keeping it
	# in the markup. Every pattern is case-insensitive and tolerates any amount of
	# whitespace around the colon, so ``display:none`` and ``display : none`` both
	# match.
	# NOTE: patterns are unanchored, so ``color`` also matches inside
	# ``background-color`` and ``height`` inside ``line-height``; the resulting
	# count is a heuristic signal, not an exact measurement.
	_HIDDEN_TEXT_PATTERNS = [
	    re.compile(r"display\s*:\s*none", re.IGNORECASE),
	    re.compile(r"font-size\s*:\s*0", re.IGNORECASE),
	    # White text (assumed to sit on a white background).
	    re.compile(r"color\s*:\s*(white|#fff|#ffffff|rgba\(255,255,255)", re.IGNORECASE),
	    re.compile(r"visibility\s*:\s*hidden", re.IGNORECASE),
	    # Fully transparent only: the lookahead rejects fractional values like 0.5.
	    re.compile(r"opacity\s*:\s*0(?!\.\d)", re.IGNORECASE),
	    # Modern phishing CSS obfuscation techniques:
	    re.compile(r"height\s*:\s*0px|height\s*:\s*0;", re.IGNORECASE),
	    re.compile(r"max-height\s*:\s*0", re.IGNORECASE),
	    re.compile(r"overflow\s*:\s*hidden", re.IGNORECASE),
	    # Large negative indent (three or more digits) pushes text off-screen.
	    re.compile(r"text-indent\s*:\s*-\d{3,}", re.IGNORECASE),
	    re.compile(r"clip\s*:\s*rect\s*\(\s*0", re.IGNORECASE),
	    re.compile(r"mso-hide\s*:\s*all", re.IGNORECASE),  # Outlook-specific hiding
	]

	# Base64 data URI pattern
	_BASE64_DATA_RE = re.compile(r"data:[^;]+;base64,", re.IGNORECASE)

	# Tracking pixel pattern (1x1 images)
	_TRACKING_PIXEL_RE = re.compile(r'(width\|height)\s[=:]\s["\']?1["\']?', re.IGNORECASE)


	def extract_html_features(html_body: str) -> Dict:
	    """Compute the 11 HTML structural anomaly features for one email body.

	    Each feature is computed independently and best-effort: when an extractor
	    raises, the failure is logged at debug level and that feature keeps its
	    zero default, so one bad extractor never discards the others.

	    Args:
	        html_body: Raw HTML string from the email body.

	    Returns:
	        Dict with 11 numeric HTML features. All values are the zero defaults
	        when html_body is empty/whitespace-only or cannot be parsed.
	    """
	    if not (html_body and html_body.strip()):
	        return _default_html_features()

	    # Bound parse time: slicing is a no-op for bodies already under the limit.
	    html_body = html_body[:_MAX_HTML_CHARS]

	    features = _default_html_features()

	    # Prefer lxml; fall back to the stdlib parser, and give up (all defaults)
	    # only when both fail.
	    try:
	        soup = BeautifulSoup(html_body, "lxml")
	    except Exception as lxml_err:
	        log.debug(f"BeautifulSoup parse error: {lxml_err}")
	        try:
	            soup = BeautifulSoup(html_body, "html.parser")
	        except Exception:
	            return features

	    # Feature key -> zero-argument extractor. Lambdas defer every lookup and
	    # call until inside the try below, so any failure stays contained.
	    extractors = (
	        ("href_text_mismatch_count", lambda: _count_href_text_mismatches(soup)),
	        ("external_form_action", lambda: int(_has_external_form_action(soup))),
	        ("hidden_text_count", lambda: _count_hidden_text_elements(soup)),
	        ("image_to_text_ratio", lambda: _compute_image_to_text_ratio(soup)),
	        ("tracking_pixel_count", lambda: _count_tracking_pixels(soup)),
	        ("base64_content_count", lambda: _count_base64_content(soup)),
	        ("javascript_count", lambda: len(soup.find_all("script"))),
	        ("external_css_count", lambda: _count_external_css(soup)),
	    )
	    for key, compute in extractors:
	        try:
	            features[key] = compute()
	        except Exception as err:
	            log.debug(f"{key} error: {err}")

	    # The three link features share one pass over the anchors.
	    try:
	        anchors = soup.find_all("a", href=True)
	        link_total = len(anchors)
	        features["total_links"] = link_total
	        distinct_domains = {
	            domain
	            for domain in (_extract_link_domain(a["href"]) for a in anchors)
	            if domain
	        }
	        features["unique_domains_in_links"] = len(distinct_domains)
	        if link_total > 0:
	            features["link_domain_diversity"] = len(distinct_domains) / link_total
	    except Exception as err:
	        log.debug(f"link domain features error: {err}")

	    return features


	# ---------------------------------------------------------------------------
	# Feature implementations
	# ---------------------------------------------------------------------------


	def _strip_www(host: str) -> str:
	    """Drop one leading ``www.`` label from *host*; other hosts are unchanged."""
	    # str.lstrip("www.") would strip any run of 'w' and '.' characters
	    # (turning "web.example.com" into "eb.example.com"), so test the prefix.
	    return host[4:] if host.startswith("www.") else host


	def _count_href_text_mismatches(soup: BeautifulSoup) -> int:
	    """Count anchors whose visible text names a different host than the href.

	    Only absolute http(s) links with non-empty visible text are considered.
	    Two situations are counted:

	    1. The href points at a raw IPv4 address while the visible text looks
	       like a domain name.
	    2. The visible text contains a dotted hostname that differs from the
	       href's hostname. A leading ``www.`` is ignored on both sides, and the
	       href's port and userinfo are not part of the comparison.

	    Args:
	        soup: Parsed email HTML.

	    Returns:
	        Number of anchors matching either situation.
	    """
	    count = 0
	    for a in soup.find_all("a", href=True):
	        href = str(a["href"]).strip()
	        visible_text = a.get_text(strip=True)

	        if not href or not visible_text:
	            continue

	        # URL schemes are case-insensitive, so "HTTPS://..." must be accepted.
	        if not re.match(r"https?://", href, re.IGNORECASE):
	            continue

	        # Case 1: href is a raw IP but visible text looks like a domain
	        if re.match(r"https?://(?:\d{1,3}\.){3}\d{1,3}", href, re.IGNORECASE):
	            if re.search(r"[a-zA-Z]{3,}\.[a-zA-Z]{2,}", visible_text):
	                count += 1
	            continue

	        # Case 2: domain in href differs from domain in visible text.
	        # Cheap pre-filter: skip text that cannot contain a hostname.
	        if not re.search(r"[a-zA-Z0-9][.-][a-zA-Z]{2,}", visible_text):
	            continue

	        # Capture the whole dotted hostname (every label), not just the first
	        # two labels; otherwise "www.paypal.com" would be read as "www.paypal".
	        text_match = re.search(r"(?:[\w-]+\.)+[a-zA-Z]{2,}", visible_text)
	        if text_match is None:
	            continue

	        try:
	            # .hostname is lower-cased and excludes userinfo and port.
	            href_host = urlparse(href).hostname or ""
	        except ValueError:
	            # Malformed authority (e.g. broken IPv6 literal): nothing to compare.
	            continue

	        href_domain = _strip_www(href_host)
	        text_domain = _strip_www(text_match.group(0).lower())
	        if href_domain and text_domain and href_domain != text_domain:
	            count += 1

	    return count


	def _has_external_form_action(soup: BeautifulSoup) -> bool:
	    """Report whether any form submits to an absolute (off-message) URL.

	    Security rationale: Credential-harvesting forms in phishing emails
	    POST login data to attacker-controlled servers. Any form action
	    pointing to an external URL is a strong indicator.

	    The sender's domain is not available here, so no domain comparison is
	    made: an action counts as external when it is an absolute ``http://`` or
	    ``https://`` URL or a protocol-relative ``//host/...`` URL. The scheme is
	    matched case-insensitively and surrounding whitespace is ignored.

	    Args:
	        soup: Parsed email HTML.

	    Returns:
	        True if at least one form has such an action, otherwise False.
	    """
	    for form in soup.find_all("form"):
	        # Missing or empty action attributes normalise to "".
	        action = str(form.get("action") or "").strip()
	        if re.match(r"(?:https?:)?//", action, re.IGNORECASE):
	            return True  # External form action found
	    return False


	def _count_hidden_text_elements(soup: BeautifulSoup) -> int:
	    """Tally elements that conceal their content from the reader.

	    Security rationale: white-on-white text, zero-size fonts and
	    display:none blocks let attackers stuff filter-evading keywords into a
	    message without the recipient ever seeing them.

	    The result is the number of elements whose inline ``style`` matches at
	    least one entry of ``_HIDDEN_TEXT_PATTERNS`` (each element counted once)
	    plus the number of elements carrying a ``hidden`` attribute.
	    """
	    styled_hits = sum(
	        1
	        for tag in soup.find_all(style=True)
	        if any(rx.search(tag.get("style", "")) for rx in _HIDDEN_TEXT_PATTERNS)
	    )
	    # Elements hidden via the HTML 'hidden' attribute are tallied separately.
	    attribute_hits = len(soup.find_all(hidden=True))
	    return styled_hits + attribute_hits


	def _compute_image_to_text_ratio(soup: BeautifulSoup) -> float:
	    """Return the number of ``<img>`` tags per whitespace-separated word.

	    Security rationale: image-only phishing emails bake their message into
	    pictures so that text-based filters have nothing to analyse; a high
	    ratio of images to words is therefore a strong phishing signal.

	    When the document has no words at all, the image count itself is
	    returned (as a float) instead of dividing by zero.
	    """
	    images = soup.find_all("img")
	    words = soup.get_text().split()
	    if words:
	        return len(images) / len(words)
	    # All images, no text
	    return float(len(images))


	def _count_tracking_pixels(soup: BeautifulSoup) -> int:
	    """Count images that look like tracking beacons.

	    Security rationale: a tracking pixel confirms that a message reached a
	    live mailbox, which phishers use to validate target lists and to time
	    follow-up attacks.

	    An image is counted when both its ``width`` and ``height`` attributes
	    are exactly ``1``, or when its ``src`` contains the word "tracking"
	    (case-insensitive).
	    """
	    total = 0
	    for image in soup.find_all("img"):
	        w, h, source = (image.get(attr, "") for attr in ("width", "height", "src"))
	        one_by_one = str(w) == "1" and str(h) == "1"
	        if one_by_one or "tracking" in source.lower():
	            total += 1
	    return total


	def _count_base64_content(soup: BeautifulSoup) -> int:
	    """Count base64 data-URI headers in the serialised document.

	    Security rationale: content embedded as base64 directly in the HTML
	    never touches a URL, so URL-based phishing filters cannot see it.
	    Legitimate mail rarely inlines more than the odd small icon this way.

	    The document is rendered back to a string and every occurrence of
	    ``_BASE64_DATA_RE`` in it is counted, whichever tag or attribute it
	    appears in.
	    """
	    serialized = str(soup)
	    return sum(1 for _ in _BASE64_DATA_RE.finditer(serialized))


	def _count_external_css(soup: BeautifulSoup) -> int:
	    """Count stylesheets that are loaded from an absolute URL.

	    Security rationale: External CSS can be used to dynamically alter the
	    appearance of email after delivery (e.g., hiding/showing content based
	    on when it is opened — a sign of delayed activation phishing).

	    A ``<link>`` is counted when its ``rel`` mentions "stylesheet" and its
	    ``href`` is an absolute ``http://`` / ``https://`` URL or a
	    protocol-relative ``//host/...`` URL. The scheme is matched
	    case-insensitively and surrounding whitespace is ignored; relative paths
	    (including ones that merely begin with the letters "http") are not
	    counted.

	    Args:
	        soup: Parsed email HTML.

	    Returns:
	        Number of externally hosted stylesheet links.
	    """
	    count = 0
	    for link in soup.find_all("link", rel=True):
	        # rel may be a list of tokens; stringify before the substring test.
	        if "stylesheet" not in str(link.get("rel", [])).lower():
	            continue
	        href = str(link.get("href") or "").strip()
	        if re.match(r"(?:https?:)?//", href, re.IGNORECASE):
	            count += 1
	    return count


	def _extract_link_domain(href: str) -> Optional[str]:
	    """Return the registered domain of an http(s) link, or None.

	    None is returned when *href* does not start with "http", when
	    tldextract yields an empty domain, or when anything raises along the
	    way (including a non-string *href*).
	    """
	    try:
	        is_web_link = href.startswith("http")
	        registered = (
	            tldextract.extract(href).top_domain_under_public_suffix
	            if is_web_link
	            else None
	        )
	    except Exception:
	        return None
	    # Normalise an empty-string result to None.
	    return registered or None


	def _has_meta_refresh(soup: BeautifulSoup) -> bool:
	    """Detect meta refresh tags that redirect to a URL.

	    Meta refresh is used by phishers to redirect victims to a malicious
	    page after a short delay, often with a blank/loading placeholder page
	    shown first to evade automated scanners.

	    A tag matches when its ``http-equiv`` is "refresh" (case-insensitive,
	    surrounding whitespace ignored) and its ``content`` contains a ``url=``
	    target. Whitespace between ``url`` and ``=`` is accepted, so both
	    ``0;url=...`` and ``0; URL = ...`` are detected.

	    Args:
	        soup: Parsed email HTML.

	    Returns:
	        True if at least one redirecting meta refresh tag is present.
	    """
	    for meta in soup.find_all("meta"):
	        # Missing attributes normalise to "" so the comparisons stay simple.
	        http_equiv = str(meta.get("http-equiv") or "").strip().lower()
	        content = str(meta.get("content") or "")
	        if http_equiv == "refresh" and re.search(r"url\s*=", content, re.IGNORECASE):
	            return True
	    return False


	def _default_html_features() -> Dict:
	    """Build a fresh feature dict with every HTML feature set to zero.

	    Counts default to the int ``0`` and ratios to the float ``0.0``. A new
	    dict is created on every call, so callers may mutate the result freely.
	    """
	    zero_values = (
	        ("href_text_mismatch_count", 0),
	        ("external_form_action", 0),
	        ("hidden_text_count", 0),
	        ("image_to_text_ratio", 0.0),
	        ("tracking_pixel_count", 0),
	        ("base64_content_count", 0),
	        ("javascript_count", 0),
	        ("external_css_count", 0),
	        ("total_links", 0),
	        ("unique_domains_in_links", 0),
	        ("link_domain_diversity", 0.0),
	    )
	    return dict(zero_values)