"""
Optimized HTML Feature Extractor for Phishing Detection
Extracts 67 features from HTML content with single-parse efficiency.
Uses cached tag lookups to avoid redundant find_all() calls.
"""
import re
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import logging
logger = logging.getLogger(__name__)
# Suspicious TLDs commonly used in phishing.
# NOTE(review): cheap/free registries favored by disposable phishing
# domains. Entries keep the leading dot -- presumably matched against
# the tail of a hostname; confirm against the consuming feature group
# (outside this chunk).
SUSPICIOUS_TLDS = {
    '.tk', '.ml', '.ga', '.cf', '.gq', '.top', '.xyz', '.buzz',
    '.club', '.online', '.site', '.icu', '.work', '.click', '.link',
    '.info', '.pw', '.cc', '.ws', '.bid', '.stream', '.racing',
}

# Brand keywords phishers commonly impersonate.
# All lowercase -- assumes the consumer lowercases text before matching
# (TODO confirm; the matching code is outside this chunk).
BRAND_KEYWORDS = [
    'paypal', 'amazon', 'google', 'microsoft', 'apple', 'facebook',
    'netflix', 'ebay', 'instagram', 'twitter', 'linkedin', 'yahoo',
    'bank', 'visa', 'mastercard', 'americanexpress', 'chase', 'wells',
    'citibank', 'dhl', 'fedex', 'ups', 'usps', 'dropbox', 'adobe',
    'spotify', 'whatsapp', 'telegram', 'steam', 'coinbase', 'binance',
]

# Urgency / social engineering keywords.
# Mix of single words and multi-word phrases ('click here', 'within 24'),
# so the consumer presumably does substring search rather than token
# matching -- TODO confirm.
URGENCY_KEYWORDS = [
    'urgent', 'verify', 'suspended', 'locked', 'confirm',
    'security', 'alert', 'warning', 'expire', 'limited',
    'immediately', 'click here', 'act now', 'unusual activity',
    'unauthorized', 'restricted', 'risk', 'compromised',
    'your account', 'update your', 'verify your', 'confirm your',
    'within 24', 'within 48', 'action required',
]
class HTMLFeatureExtractor:
"""
High-performance HTML feature extractor.
Parses HTML once and caches all tag lookups for efficiency.
Designed for batch processing of 40k+ files.
"""
def extract_features(self, html_content: str, url: str | None = None) -> dict:
    """
    Extract all features from HTML content in a single pass.

    Args:
        html_content: Raw HTML string.
        url: Optional source URL for context (not consumed by the
            feature groups visible here; kept for interface stability).

    Returns:
        Dictionary with 67 numeric features, or ``self._default_features()``
        if extraction fails entirely.
    """
    try:
        # --- Single parse with fast parser ---
        # Prefer lxml for speed; fall back to the stdlib parser if lxml
        # is unavailable or rejects the document.
        try:
            soup = BeautifulSoup(html_content, 'lxml')
        except Exception:
            soup = BeautifulSoup(html_content, 'html.parser')

        # --- Cache tag lookups (done ONCE, shared by all groups) ---
        cache = self._build_cache(soup)

        features = {}
        features.update(self._structure_features(soup, cache, html_content))
        features.update(self._form_features(cache))
        features.update(self._link_features(cache))
        features.update(self._script_features(cache))
        features.update(self._text_features(soup, cache))
        features.update(self._meta_features(soup, cache))
        features.update(self._resource_features(cache))
        features.update(self._advanced_features(soup, cache))
        return features
    except Exception:
        # Boundary handler: one malformed document must not abort a
        # 40k-file batch. Log lazily with the full traceback (the old
        # f-string formatted eagerly and dropped the stack).
        logger.debug("Feature extraction error", exc_info=True)
        return self._default_features()
# ------------------------------------------------------------------
# Cache builder – avoids redundant find_all() across feature groups
# ------------------------------------------------------------------
@staticmethod
def _build_cache(soup) -> dict:
"""Build a lookup cache of all tags we need. Called once per document."""
all_tags = soup.find_all()
# Classify tags by name in a single pass
by_name: dict[str, list] = {}
for tag in all_tags:
by_name.setdefault(tag.name, []).append(tag)
# Convenience lists used by multiple feature groups
links_a = by_name.get('a', [])
forms = by_name.get('form', [])
inputs = by_name.get('input', [])
scripts = by_name.get('script', [])
images = by_name.get('img', [])
iframes = by_name.get('iframe', [])
meta_tags = by_name.get('meta', [])
style_tags = by_name.get('style', [])
css_links = [t for t in by_name.get('link', [])
if t.get('rel') and 'stylesheet' in t.get('rel', [])]
all_link_tags = by_name.get('link', [])
# Pre-extract hrefs and input types (used in several groups)
hrefs = [a.get('href', '') or '' for a in links_a]
input_types = [(inp, (inp.get('type', '') or '').lower()) for inp in inputs]
return {
'all_tags': all_tags,
'by_name': by_name,
'links_a': links_a,
'hrefs': hrefs,
'forms': forms,
'inputs': inputs,
'input_types': input_types,
'scripts': scripts,
'images': images,
'iframes': iframes,
'meta_tags': meta_tags,
'style_tags': style_tags,
'css_links': css_links,
'all_link_tags': all_link_tags,
}
# ------------------------------------------------------------------
# 1. Structure features (12)
# ------------------------------------------------------------------
@staticmethod
def _structure_features(soup, c: dict, raw_html: str) -> dict:
bn = c['by_name']
# DOM depth – walk just the
body = soup.find('body')
max_depth = 0
if body:
stack = [(body, 0)]
while stack:
node, depth = stack.pop()
if depth > max_depth:
max_depth = depth
for child in getattr(node, 'children', []):
if hasattr(child, 'name') and child.name:
stack.append((child, depth + 1))
return {
'html_length': len(raw_html),
'num_tags': len(c['all_tags']),
'num_divs': len(bn.get('div', [])),
'num_spans': len(bn.get('span', [])),
'num_paragraphs': len(bn.get('p', [])),
'num_headings': sum(len(bn.get(h, []))
for h in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')),
'num_lists': len(bn.get('ul', [])) + len(bn.get('ol', [])),
'num_images': len(c['images']),
'num_iframes': len(c['iframes']),
'num_tables': len(bn.get('table', [])),
'has_title': 1 if soup.find('title') else 0,
'dom_depth': max_depth,
}
# ------------------------------------------------------------------
# 2. Form features (11)
# ------------------------------------------------------------------
@staticmethod
def _form_features(c: dict) -> dict:
forms = c['forms']
input_types = c['input_types']
n_password = sum(1 for _, t in input_types if t == 'password')
n_email = sum(1 for _, t in input_types if t == 'email')
n_text = sum(1 for _, t in input_types if t == 'text')
n_hidden = sum(1 for _, t in input_types if t == 'hidden')
n_submit = sum(1 for _, t in input_types if t == 'submit')
# Also count