File size: 11,297 Bytes
0fd143d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
"""
PhishLens HTML Structural Anomaly Feature Module.

Extracts 11 features from the HTML body of emails by parsing with BeautifulSoup.
Phishing emails rely heavily on HTML tricks to hide malicious content, redirect
users, and harvest credentials.

Security rationale: HTML-based obfuscation is a primary evasion technique.
Hidden text (display:none), forms that POST to attacker-controlled domains,
and href/visible-text mismatches are reliable signals: an attacker cannot use
these tricks without tripping the corresponding feature. These features
complement the NLP features, which only see the rendered/visible text.
"""

from __future__ import annotations

import re
from typing import Dict, List, Optional
from urllib.parse import urlparse

from bs4 import BeautifulSoup
import tldextract

from src.utils.logger import get_logger

# Cap the HTML handed to BeautifulSoup so parse time stays bounded on
# monster HTML emails (multi-MB newsletters, base64-inlined images, etc.).
# The cap is applied by slicing a ``str``, so it counts characters, not bytes;
# "200 KB" is therefore approximate.
# NOTE(review): the original rationale is that structural signals (links,
# forms, hidden elements) appear early in the document, so truncation should
# not materially affect feature quality — this is an assumption, and anything
# after the cut-off (including a tag split by the slice) is not analysed.
_MAX_HTML_CHARS = 200_000  # characters (~200 KB for ASCII-only markup)

# Module-level logger obtained from the project's logging helper.
log = get_logger(__name__)

# CSS declarations treated as "hidden text" indicators. Each pattern is
# searched (unanchored, case-insensitively) inside the inline ``style``
# attribute of an element; one hit per element is enough to count it.
# NOTE(review): because the patterns are unanchored, some also match longer
# property names or values:
#   * ``color: white`` also matches the tail of ``background-color: white``;
#   * ``font-size: 0`` also matches ``font-size: 0.8em`` (no look-ahead, unlike
#     the opacity pattern which rejects ``0.<digit>``);
#   * ``height: 0px`` also matches inside ``line-height: 0px``;
#   * ``overflow: hidden`` is presumably common in legitimate layouts — confirm
#     that this false-positive rate is acceptable for the model.
_HIDDEN_TEXT_PATTERNS = [
    re.compile(r"display\s*:\s*none", re.IGNORECASE),
    re.compile(r"font-size\s*:\s*0", re.IGNORECASE),
    re.compile(r"color\s*:\s*(white|#fff|#ffffff|rgba\(255,255,255)", re.IGNORECASE),
    re.compile(r"visibility\s*:\s*hidden", re.IGNORECASE),
    re.compile(r"opacity\s*:\s*0(?!\.\d)", re.IGNORECASE),
    # Modern phishing CSS obfuscation techniques:
    re.compile(r"height\s*:\s*0px|height\s*:\s*0;", re.IGNORECASE),
    re.compile(r"max-height\s*:\s*0", re.IGNORECASE),
    re.compile(r"overflow\s*:\s*hidden", re.IGNORECASE),
    re.compile(r"text-indent\s*:\s*-\d{3,}", re.IGNORECASE),
    re.compile(r"clip\s*:\s*rect\s*\(\s*0", re.IGNORECASE),
    re.compile(r"mso-hide\s*:\s*all", re.IGNORECASE),  # Outlook-specific hiding
]

# Matches the header of a base64 data URI (``data:<mime>;base64,``); used to
# count inline base64 payloads in the serialised document.
_BASE64_DATA_RE = re.compile(r"data:[^;]+;base64,", re.IGNORECASE)

# Tracking pixel pattern (1x1 images): ``width``/``height`` followed by ``=``
# or ``:`` and an optionally quoted ``1``.
# NOTE(review): nothing anchors the end of the value, so ``width=100`` matches
# too, and this pattern is not referenced by any function visible in this
# module (pixel detection compares the width/height attributes directly).
_TRACKING_PIXEL_RE = re.compile(r'(width|height)\s*[=:]\s*["\']?1["\']?', re.IGNORECASE)


def extract_html_features(html_body: str) -> Dict:
    """Compute the 11 HTML structural anomaly features for one email body.

    The body is truncated to ``_MAX_HTML_CHARS`` characters, parsed with
    BeautifulSoup (``lxml`` first, ``html.parser`` as fallback) and then each
    feature is computed independently: a failure in one extractor is logged
    at debug level and leaves that feature at its default value.

    Args:
        html_body: Raw HTML string from the email body.

    Returns:
        Dict with 11 numeric HTML features. All values are the zero defaults
        when ``html_body`` is empty/whitespace or cannot be parsed.
    """
    if not html_body or not html_body.strip():
        return _default_html_features()

    # Slicing is a no-op for bodies already within the limit.
    html_body = html_body[:_MAX_HTML_CHARS]

    features = _default_html_features()

    def _parse(markup):
        """Parse with lxml, falling back to html.parser; None if both fail."""
        try:
            return BeautifulSoup(markup, "lxml")
        except Exception as exc:
            log.debug(f"BeautifulSoup parse error: {exc}")
        try:
            return BeautifulSoup(markup, "html.parser")
        except Exception:
            return None

    soup = _parse(html_body)
    if soup is None:
        return features

    # Each feature is isolated in its own try/except/else so that one broken
    # extractor cannot wipe out the others.
    try:
        mismatch_total = _count_href_text_mismatches(soup)
    except Exception as exc:
        log.debug(f"href_text_mismatch_count error: {exc}")
    else:
        features["href_text_mismatch_count"] = mismatch_total

    try:
        external_form_flag = int(_has_external_form_action(soup))
    except Exception as exc:
        log.debug(f"external_form_action error: {exc}")
    else:
        features["external_form_action"] = external_form_flag

    try:
        hidden_total = _count_hidden_text_elements(soup)
    except Exception as exc:
        log.debug(f"hidden_text_count error: {exc}")
    else:
        features["hidden_text_count"] = hidden_total

    try:
        image_ratio = _compute_image_to_text_ratio(soup)
    except Exception as exc:
        log.debug(f"image_to_text_ratio error: {exc}")
    else:
        features["image_to_text_ratio"] = image_ratio

    try:
        pixel_total = _count_tracking_pixels(soup)
    except Exception as exc:
        log.debug(f"tracking_pixel_count error: {exc}")
    else:
        features["tracking_pixel_count"] = pixel_total

    try:
        base64_total = _count_base64_content(soup)
    except Exception as exc:
        log.debug(f"base64_content_count error: {exc}")
    else:
        features["base64_content_count"] = base64_total

    try:
        script_total = len(soup.find_all("script"))
    except Exception as exc:
        log.debug(f"javascript_count error: {exc}")
    else:
        features["javascript_count"] = script_total

    try:
        stylesheet_total = _count_external_css(soup)
    except Exception as exc:
        log.debug(f"external_css_count error: {exc}")
    else:
        features["external_css_count"] = stylesheet_total

    # The three link features share one pass over the anchors. They are
    # written as soon as they are known, so a late failure keeps the earlier
    # values.
    try:
        anchors = soup.find_all("a", href=True)
        link_total = len(anchors)
        features["total_links"] = link_total
        resolved = [_extract_link_domain(anchor["href"]) for anchor in anchors]
        distinct = {domain for domain in resolved if domain}
        features["unique_domains_in_links"] = len(distinct)
        if link_total > 0:
            features["link_domain_diversity"] = len(distinct) / link_total
    except Exception as exc:
        log.debug(f"link domain features error: {exc}")

    return features


# ---------------------------------------------------------------------------
# Feature implementations
# ---------------------------------------------------------------------------


# Absolute web link (URL schemes are case-insensitive).
_WEB_HREF_RE = re.compile(r"https?://", re.IGNORECASE)
# Absolute web link whose host is a dotted-quad IPv4 address.
_IP_HREF_RE = re.compile(r"https?://(?:\d{1,3}\.){3}\d{1,3}", re.IGNORECASE)
# Visible text that looks like it names a domain (e.g. "paypal.com").
_DOMAIN_LIKE_TEXT_RE = re.compile(r"[a-zA-Z]{3,}\.[a-zA-Z]{2,}")
# Cheap pre-filter: text containing something host-like at all.
_HOST_HINT_RE = re.compile(r"[a-zA-Z0-9][.-][a-zA-Z]{2,}")
# Full dotted host name inside visible text ("www.secure.example.com").
_TEXT_HOST_RE = re.compile(r"(?:[\w-]+\.)+[a-zA-Z]{2,}")


def _strip_www_prefix(host: str) -> str:
    """Remove one leading ``www.`` label (a prefix, not a character set)."""
    return host[4:] if host.startswith("www.") else host


def _count_href_text_mismatches(soup: BeautifulSoup) -> int:
    """Count anchor tags whose visible text names a different host than the href.

    Two situations are counted, at most once per anchor:

    1. IP-in-href trick: the href points to a raw IPv4 address while the
       visible text looks like a domain name.
    2. The first host name found in the visible text differs from the host
       of the href (both compared lower-cased, without a leading ``www.``).

    Anchors with an empty href or text, or with a non-http(s) href, are
    ignored, as are anchors whose text contains nothing host-like.

    Args:
        soup: Parsed document; only ``find_all("a", href=True)`` is used.

    Returns:
        Number of anchors considered mismatched.
    """
    count = 0
    for a in soup.find_all("a", href=True):
        href = str(a["href"]).strip()
        visible_text = a.get_text(strip=True)

        if not href or not visible_text:
            continue

        if not _WEB_HREF_RE.match(href):
            continue

        # Case 1: href is a raw IP but visible text looks like a domain
        if _IP_HREF_RE.match(href) and _DOMAIN_LIKE_TEXT_RE.search(visible_text):
            count += 1
            continue

        # Case 2: domain in href ≠ domain in visible text
        if not _HOST_HINT_RE.search(visible_text):
            continue

        try:
            # ``hostname`` is already lower-cased and excludes any
            # ``user:pass@`` prefix and ``:port`` suffix that ``netloc`` keeps.
            href_host = urlparse(href).hostname or ""
        except ValueError:
            # Malformed authority (e.g. unbalanced IPv6 brackets): nothing
            # reliable to compare against, so skip this anchor.
            continue

        text_match = _TEXT_HOST_RE.search(visible_text)
        if not text_match:
            continue

        href_host = _strip_www_prefix(href_host)
        text_host = _strip_www_prefix(text_match.group(0).lower())
        if href_host and text_host and href_host != text_host:
            count += 1

    return count


def _has_external_form_action(soup: BeautifulSoup) -> bool:
    """Detect forms whose ``action`` is an absolute web URL.

    Security rationale: Credential-harvesting forms in phishing emails
    POST login data to attacker-controlled servers. An email has no origin
    of its own, so any form action pointing to an absolute URL is a strong
    indicator. No comparison with the sender's domain is made here.

    An action counts as external when, after trimming surrounding
    whitespace, it starts with ``http://``, ``https://`` (any letter case)
    or the protocol-relative form ``//host``.

    Args:
        soup: Parsed document; only ``find_all("form")`` is used.

    Returns:
        True if at least one form has an external action, else False.
    """
    for form in soup.find_all("form"):
        # A missing or valueless attribute is normalised to "".
        action = str(form.get("action") or "").strip()
        # Schemes are case-insensitive and "//host/path" inherits the scheme
        # of the viewing client, so both must be treated as absolute targets.
        if re.match(r"(?:https?:)?//", action, re.IGNORECASE):
            return True     # External form action found
    return False


def _count_hidden_text_elements(soup: BeautifulSoup) -> int:
    """Count elements that hide their content via inline CSS or ``hidden``.

    Security rationale: Hidden white-on-white text, zero-font-size content,
    and display:none elements are used to stuff keywords that evade spam
    filters while remaining invisible to human readers.

    The result is the number of elements whose inline ``style`` matches at
    least one entry of ``_HIDDEN_TEXT_PATTERNS`` plus the number of elements
    carrying a ``hidden`` attribute. The two tallies are simply added.
    """
    # One hit per styled element, no matter how many patterns match.
    styled_hits = sum(
        1
        for node in soup.find_all(style=True)
        if any(rx.search(node.get("style", "")) for rx in _HIDDEN_TEXT_PATTERNS)
    )
    # Elements flagged with the HTML 'hidden' attribute.
    attribute_hits = len(soup.find_all(hidden=True))
    return styled_hits + attribute_hits


def _compute_image_to_text_ratio(soup: BeautifulSoup) -> float:
    """Return the number of ``<img>`` tags per whitespace-separated word.

    Security rationale: Pure-image phishing emails contain no analysable text
    by design — the phishing content is baked into images to evade text-based
    filters. A high image-to-text ratio is a strong phishing signal.

    When the document has no words at all, the image count itself is
    returned (as a float) instead of dividing by zero.
    """
    images = soup.find_all("img")
    words = soup.get_text().split()
    n_images, n_words = len(images), len(words)
    # No text: fall back to the raw image count.
    return n_images / n_words if n_words != 0 else float(n_images)


def _count_tracking_pixels(soup: BeautifulSoup) -> int:
    """Count images that look like tracking pixels.

    Security rationale: Tracking pixels confirm delivery to a live email
    address. Phishers use them to validate target lists and time follow-up attacks.

    An ``<img>`` is counted when its ``width`` and ``height`` attributes are
    both exactly ``"1"``, or when its ``src`` contains the word "tracking"
    (case-insensitive).
    """

    def _is_pixel(img) -> bool:
        one_by_one = (
            str(img.get("width", "")) == "1" and str(img.get("height", "")) == "1"
        )
        # The src is inspected only when the size check did not already match.
        return one_by_one or "tracking" in img.get("src", "").lower()

    return sum(1 for img in soup.find_all("img") if _is_pixel(img))


def _count_base64_content(soup: BeautifulSoup) -> int:
    """Count ``data:...;base64,`` URIs in the serialised document.

    Security rationale: Base64-encoded content embedded directly in HTML
    bypasses URL-based phishing filters entirely. Legitimate email rarely
    uses inline base64 for anything other than small icons.

    The whole document is rendered back to a string and every occurrence of
    ``_BASE64_DATA_RE`` is tallied.
    """
    markup = str(soup)
    return sum(1 for _ in _BASE64_DATA_RE.finditer(markup))


def _count_external_css(soup: BeautifulSoup) -> int:
    """Count ``<link rel="stylesheet">`` tags that load CSS from an http(s) URL.

    Security rationale: External CSS can be used to dynamically alter the
    appearance of email after delivery (e.g., hiding/showing content based
    on when it is opened — a sign of delayed activation phishing).

    A link is counted when the string form of its ``rel`` value contains
    "stylesheet" (case-insensitive) and its ``href`` starts with "http".
    """
    return sum(
        1
        for tag in soup.find_all("link", rel=True)
        if "stylesheet" in str(tag.get("rel", [])).lower()
        and tag.get("href", "").startswith("http")
    )


def _extract_link_domain(href: str) -> Optional[str]:
    """Return the registered domain of an http(s) href, or None.

    None is returned for hrefs that do not start with "http", when
    tldextract yields an empty registered domain, and when anything raises
    along the way (best-effort extraction).
    """
    try:
        if href.startswith("http"):
            registered = tldextract.extract(href).top_domain_under_public_suffix
            return registered if registered else None
        return None
    except Exception:
        return None


def _has_meta_refresh(soup: BeautifulSoup) -> bool:
    """Report whether any ``<meta http-equiv="refresh">`` tag carries a URL.

    Meta refresh is used by phishers to redirect victims to a malicious
    page after a short delay, often with a blank/loading placeholder page
    shown first to evade automated scanners.

    A tag qualifies when its ``http-equiv`` equals "refresh" and its
    ``content`` contains "url=" (both compared case-insensitively).
    """

    def _redirects(tag) -> bool:
        equiv = tag.get("http-equiv", "").lower()
        payload = tag.get("content", "").lower()
        return equiv == "refresh" and "url=" in payload

    return any(_redirects(tag) for tag in soup.find_all("meta"))


def _default_html_features() -> Dict:
    """Build a fresh feature dict with every HTML feature set to zero.

    Counts and flags default to the int ``0``; the two ratio features default
    to the float ``0.0``. Key order is fixed and a new dict is created on
    each call, so callers may mutate the result freely.
    """
    zeroed = [
        ("href_text_mismatch_count", 0),
        ("external_form_action", 0),
        ("hidden_text_count", 0),
        ("image_to_text_ratio", 0.0),
        ("tracking_pixel_count", 0),
        ("base64_content_count", 0),
        ("javascript_count", 0),
        ("external_css_count", 0),
        ("total_links", 0),
        ("unique_domains_in_links", 0),
        ("link_domain_diversity", 0.0),
    ]
    return dict(zeroed)