""" Optimized HTML Feature Extractor for Phishing Detection Extracts 67 features from HTML content with single-parse efficiency. Uses cached tag lookups to avoid redundant find_all() calls. """ import re from urllib.parse import urlparse from bs4 import BeautifulSoup import logging logger = logging.getLogger(__name__) # Suspicious TLDs commonly used in phishing SUSPICIOUS_TLDS = { '.tk', '.ml', '.ga', '.cf', '.gq', '.top', '.xyz', '.buzz', '.club', '.online', '.site', '.icu', '.work', '.click', '.link', '.info', '.pw', '.cc', '.ws', '.bid', '.stream', '.racing', } # Brand keywords phishers commonly impersonate BRAND_KEYWORDS = [ 'paypal', 'amazon', 'google', 'microsoft', 'apple', 'facebook', 'netflix', 'ebay', 'instagram', 'twitter', 'linkedin', 'yahoo', 'bank', 'visa', 'mastercard', 'americanexpress', 'chase', 'wells', 'citibank', 'dhl', 'fedex', 'ups', 'usps', 'dropbox', 'adobe', 'spotify', 'whatsapp', 'telegram', 'steam', 'coinbase', 'binance', ] # Urgency / social engineering keywords URGENCY_KEYWORDS = [ 'urgent', 'verify', 'suspended', 'locked', 'confirm', 'security', 'alert', 'warning', 'expire', 'limited', 'immediately', 'click here', 'act now', 'unusual activity', 'unauthorized', 'restricted', 'risk', 'compromised', 'your account', 'update your', 'verify your', 'confirm your', 'within 24', 'within 48', 'action required', ] class HTMLFeatureExtractor: """ High-performance HTML feature extractor. Parses HTML once and caches all tag lookups for efficiency. Designed for batch processing of 40k+ files. """ def extract_features(self, html_content: str, url: str | None = None) -> dict: """ Extract all features from HTML content in a single pass. 
Args: html_content: Raw HTML string url: Optional source URL for context Returns: Dictionary with 67 numeric features """ try: # --- Single parse with fast parser --- try: soup = BeautifulSoup(html_content, 'lxml') except Exception: soup = BeautifulSoup(html_content, 'html.parser') # --- Cache tag lookups (done ONCE) --- cache = self._build_cache(soup) features = {} features.update(self._structure_features(soup, cache, html_content)) features.update(self._form_features(cache)) features.update(self._link_features(cache)) features.update(self._script_features(cache)) features.update(self._text_features(soup, cache)) features.update(self._meta_features(soup, cache)) features.update(self._resource_features(cache)) features.update(self._advanced_features(soup, cache)) return features except Exception as e: logger.debug(f"Feature extraction error: {e}") return self._default_features() # ------------------------------------------------------------------ # Cache builder – avoids redundant find_all() across feature groups # ------------------------------------------------------------------ @staticmethod def _build_cache(soup) -> dict: """Build a lookup cache of all tags we need. 
Called once per document.""" all_tags = soup.find_all() # Classify tags by name in a single pass by_name: dict[str, list] = {} for tag in all_tags: by_name.setdefault(tag.name, []).append(tag) # Convenience lists used by multiple feature groups links_a = by_name.get('a', []) forms = by_name.get('form', []) inputs = by_name.get('input', []) scripts = by_name.get('script', []) images = by_name.get('img', []) iframes = by_name.get('iframe', []) meta_tags = by_name.get('meta', []) style_tags = by_name.get('style', []) css_links = [t for t in by_name.get('link', []) if t.get('rel') and 'stylesheet' in t.get('rel', [])] all_link_tags = by_name.get('link', []) # Pre-extract hrefs and input types (used in several groups) hrefs = [a.get('href', '') or '' for a in links_a] input_types = [(inp, (inp.get('type', '') or '').lower()) for inp in inputs] return { 'all_tags': all_tags, 'by_name': by_name, 'links_a': links_a, 'hrefs': hrefs, 'forms': forms, 'inputs': inputs, 'input_types': input_types, 'scripts': scripts, 'images': images, 'iframes': iframes, 'meta_tags': meta_tags, 'style_tags': style_tags, 'css_links': css_links, 'all_link_tags': all_link_tags, } # ------------------------------------------------------------------ # 1. 
    # ------------------------------------------------------------------
    # 1. Structure features (12)
    # ------------------------------------------------------------------
    @staticmethod
    def _structure_features(soup, c: dict, raw_html: str) -> dict:
        """Counts of basic structural tags plus maximum DOM depth.

        Args:
            soup: Parsed BeautifulSoup document.
            c: Tag-lookup cache produced by ``_build_cache``.
            raw_html: Original HTML string (only its length is used).

        Returns:
            Dict of 12 numeric structure features.
        """
        bn = c['by_name']
        # DOM depth – iterative walk restricted to the <body> subtree
        body = soup.find('body')
        max_depth = 0
        if body:
            # Explicit stack instead of recursion: hostile/degenerate
            # DOMs can nest deeply enough to hit the recursion limit.
            stack = [(body, 0)]
            while stack:
                node, depth = stack.pop()
                if depth > max_depth:
                    max_depth = depth
                for child in getattr(node, 'children', []):
                    # Only descend into element nodes (text nodes have
                    # no tag name).
                    if hasattr(child, 'name') and child.name:
                        stack.append((child, depth + 1))
        return {
            'html_length': len(raw_html),
            'num_tags': len(c['all_tags']),
            'num_divs': len(bn.get('div', [])),
            'num_spans': len(bn.get('span', [])),
            'num_paragraphs': len(bn.get('p', [])),
            'num_headings': sum(len(bn.get(h, []))
                                for h in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')),
            'num_lists': len(bn.get('ul', [])) + len(bn.get('ol', [])),
            'num_images': len(c['images']),
            'num_iframes': len(c['iframes']),
            'num_tables': len(bn.get('table', [])),
            'has_title': 1 if soup.find('title') else 0,
            'dom_depth': max_depth,
        }

    # ------------------------------------------------------------------
    # 2. Form features (11)
    # ------------------------------------------------------------------
    @staticmethod
    def _form_features(c: dict) -> dict:
        """Form/input features — credential-harvesting signals.

        NOTE(review): this method continues beyond the visible chunk;
        only the input-type tallies are shown here.
        """
        forms = c['forms']
        input_types = c['input_types']
        # Tally <input> elements by their (already lowercased) type.
        n_password = sum(1 for _, t in input_types if t == 'password')
        n_email = sum(1 for _, t in input_types if t == 'email')
        n_text = sum(1 for _, t in input_types if t == 'text')
        n_hidden = sum(1 for _, t in input_types if t == 'hidden')
        n_submit = sum(1 for _, t in input_types if t == 'submit')
        # Also count