"""
HTML Feature Extractor for Phishing Detection
Extracts ~50 features from HTML content including forms, links, scripts, etc.
"""
import re
from pathlib import Path
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import pandas as pd
import numpy as np
class HTMLFeatureExtractor:
    """Extract features from HTML content for phishing detection.

    Produces a flat dict of numeric features covering document structure,
    forms, links, scripts, visible text, meta tags, external resources, and
    advanced phishing indicators (obfuscation, hidden elements, redirects).
    The set and order of keys is fixed by ``_FEATURE_NAMES`` so that the
    error fallback and ``get_feature_names()`` always agree with extraction.
    """

    # Canonical, ordered list of every feature emitted by the _extract_*
    # methods.  FIX: the old hand-written defaults dict was missing the
    # meta-tag flags (has_description/has_keywords/has_author/has_viewport/
    # has_meta_refresh) and duplicated four resource keys, so the fallback
    # vector did not match a successful extraction.
    _FEATURE_NAMES = (
        # Structure features (11)
        'html_length', 'num_tags', 'num_divs', 'num_spans',
        'num_paragraphs', 'num_headings', 'num_lists', 'num_images',
        'num_iframes', 'num_tables', 'has_title',
        # Form features (10)
        'num_forms', 'num_input_fields', 'num_password_fields',
        'num_email_fields', 'num_text_fields', 'num_submit_buttons',
        'num_hidden_fields', 'has_form', 'num_external_form_actions',
        'num_empty_form_actions',
        # Link features (8)
        'num_links', 'num_external_links', 'num_internal_links',
        'num_empty_links', 'num_mailto_links', 'num_javascript_links',
        'ratio_external_links', 'num_ip_based_links',
        # Script features (7)
        'num_scripts', 'num_inline_scripts', 'num_external_scripts',
        'has_eval', 'has_unescape', 'has_escape', 'has_document_write',
        # Text features (7)
        'text_length', 'num_words', 'num_brand_mentions',
        'num_urgency_keywords', 'has_copyright', 'has_phone_number',
        'has_email',
        # Meta features (6)
        'num_meta_tags', 'has_description', 'has_keywords', 'has_author',
        'has_viewport', 'has_meta_refresh',
        # Resource features (7)
        'num_css_files', 'num_external_css', 'num_external_images',
        'num_data_uri_images', 'num_inline_styles', 'inline_css_length',
        'has_favicon',
        # Advanced phishing indicators (13)
        'password_with_external_action', 'has_base64', 'has_atob',
        'has_fromcharcode', 'num_onload_events', 'num_onerror_events',
        'num_onclick_events', 'num_unique_external_domains',
        'num_forms_without_labels', 'has_display_none',
        'has_visibility_hidden', 'has_window_open', 'has_location_replace',
    )

    def __init__(self):
        # Brand names commonly impersonated in phishing pages.
        self.brand_keywords = [
            'paypal', 'amazon', 'google', 'microsoft', 'apple', 'facebook',
            'netflix', 'ebay', 'instagram', 'twitter', 'linkedin', 'yahoo',
            'bank', 'visa', 'mastercard', 'americanexpress', 'chase', 'wells',
            'citibank', 'dhl', 'fedex', 'ups', 'usps'
        ]
        # Urgency/social-engineering keywords.
        # FIX: 'suspended' was listed twice, so pages containing it were
        # double-counted in num_urgency_keywords.
        self.urgency_keywords = [
            'urgent', 'verify', 'account', 'suspended', 'locked', 'confirm',
            'update', 'security', 'alert', 'warning', 'expire', 'limited',
            'immediately', 'click here', 'act now', 'unusual',
            'unauthorized', 'restricted'
        ]

    def extract_features(self, html_content, url=None):
        """
        Extract all HTML features from content.

        Args:
            html_content: HTML string content
            url: Optional URL for additional context

        Returns:
            Dictionary of features.  On any parsing error a dict with the
            same keys and all values 0 is returned instead.
        """
        features = {}
        try:
            soup = BeautifulSoup(html_content, 'html.parser')
            # Basic structure features
            features.update(self._extract_structure_features(soup))
            # Form features
            features.update(self._extract_form_features(soup))
            # Link features
            features.update(self._extract_link_features(soup, url))
            # Script features
            features.update(self._extract_script_features(soup))
            # Text content features
            features.update(self._extract_text_features(soup))
            # Meta tag features
            features.update(self._extract_meta_features(soup))
            # External resource features
            features.update(self._extract_resource_features(soup, url))
            # Advanced phishing indicators
            features.update(self._extract_advanced_features(soup))
        except Exception as e:
            # Batch-processing boundary: one malformed page must not crash
            # the whole run; fall back to the all-zero vector.
            print(f"Error extracting features: {e}")
            features = self._get_default_features()
        return features

    def _extract_structure_features(self, soup):
        """Extract basic HTML structure features."""
        return {
            'html_length': len(str(soup)),
            'num_tags': len(soup.find_all()),
            'num_divs': len(soup.find_all('div')),
            'num_spans': len(soup.find_all('span')),
            'num_paragraphs': len(soup.find_all('p')),
            'num_headings': len(soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])),
            'num_lists': len(soup.find_all(['ul', 'ol'])),
            'num_images': len(soup.find_all('img')),
            'num_iframes': len(soup.find_all('iframe')),
            'num_tables': len(soup.find_all('table')),
            'has_title': 1 if soup.find('title') else 0,
        }

    def _extract_form_features(self, soup):
        """Extract form-related features."""
        forms = soup.find_all('form')
        features = {
            'num_forms': len(forms),
            'num_input_fields': len(soup.find_all('input')),
            'num_password_fields': len(soup.find_all('input', {'type': 'password'})),
            'num_email_fields': len(soup.find_all('input', {'type': 'email'})),
            'num_text_fields': len(soup.find_all('input', {'type': 'text'})),
            'num_submit_buttons': len(soup.find_all(['input', 'button'], {'type': 'submit'})),
            'num_hidden_fields': len(soup.find_all('input', {'type': 'hidden'})),
            'has_form': 1 if forms else 0,
        }
        # Forms posting to an absolute http(s) URL (credential exfiltration)
        # or with no/placeholder action are both phishing signals.
        if forms:
            form_actions = [form.get('action', '') for form in forms]
            features['num_external_form_actions'] = sum(
                1 for action in form_actions if action.startswith('http'))
            features['num_empty_form_actions'] = sum(
                1 for action in form_actions if not action or action == '#')
        else:
            features['num_external_form_actions'] = 0
            features['num_empty_form_actions'] = 0
        return features

    def _extract_link_features(self, soup, url=None):
        """Extract link-related features."""
        links = soup.find_all('a')
        hrefs = [link.get('href', '') for link in links]
        features = {
            'num_links': len(links),
            'num_external_links': sum(1 for href in hrefs if href.startswith('http')),
            'num_internal_links': sum(1 for href in hrefs
                                      if href.startswith('/') or href.startswith('#')),
            'num_empty_links': sum(1 for href in hrefs if not href or href == '#'),
            'num_mailto_links': sum(1 for href in hrefs if href.startswith('mailto:')),
            'num_javascript_links': sum(1 for href in hrefs if 'javascript:' in href.lower()),
        }
        # Ratio of external links; 0 for a page with no anchors at all.
        if features['num_links'] > 0:
            features['ratio_external_links'] = features['num_external_links'] / features['num_links']
        else:
            features['ratio_external_links'] = 0
        # Raw-IP URLs are a classic phishing tell (no registered domain).
        features['num_ip_based_links'] = sum(
            1 for href in hrefs if re.search(r'http://\d+\.\d+\.\d+\.\d+', href))
        return features

    def _extract_script_features(self, soup):
        """Extract JavaScript/script features."""
        scripts = soup.find_all('script')
        features = {
            'num_scripts': len(scripts),
            'num_inline_scripts': sum(1 for script in scripts if script.string),
            'num_external_scripts': sum(1 for script in scripts if script.get('src')),
        }
        # Substring checks for common obfuscation/injection primitives.
        script_content = ' '.join([script.string for script in scripts if script.string])
        features['has_eval'] = 1 if 'eval(' in script_content else 0
        features['has_unescape'] = 1 if 'unescape(' in script_content else 0
        features['has_escape'] = 1 if 'escape(' in script_content else 0
        features['has_document_write'] = 1 if 'document.write' in script_content else 0
        return features

    def _extract_text_features(self, soup):
        """Extract text content features."""
        # All visible text, lower-cased for case-insensitive keyword checks.
        text = soup.get_text(separator=' ', strip=True).lower()
        features = {
            'text_length': len(text),
            'num_words': len(text.split()),
        }
        # Distinct brand keywords mentioned (each counted at most once).
        features['num_brand_mentions'] = sum(
            1 for brand in self.brand_keywords if brand in text)
        # Distinct urgency keywords mentioned.
        features['num_urgency_keywords'] = sum(
            1 for keyword in self.urgency_keywords if keyword in text)
        # Trust-signal and contact-info patterns.
        features['has_copyright'] = 1 if '©' in text or 'copyright' in text else 0
        features['has_phone_number'] = 1 if re.search(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', text) else 0
        features['has_email'] = 1 if re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text) else 0
        return features

    def _extract_meta_features(self, soup):
        """Extract meta tag features."""
        meta_tags = soup.find_all('meta')
        features = {
            'num_meta_tags': len(meta_tags),
            'has_description': 1 if soup.find('meta', {'name': 'description'}) else 0,
            'has_keywords': 1 if soup.find('meta', {'name': 'keywords'}) else 0,
            'has_author': 1 if soup.find('meta', {'name': 'author'}) else 0,
            'has_viewport': 1 if soup.find('meta', {'name': 'viewport'}) else 0,
        }
        # Meta refresh is often used by phishing pages to bounce the victim.
        refresh_meta = soup.find('meta', {'http-equiv': 'refresh'})
        features['has_meta_refresh'] = 1 if refresh_meta else 0
        return features

    def _extract_resource_features(self, soup, url=None):
        """Extract external resource features."""
        css_links = soup.find_all('link', {'rel': 'stylesheet'})
        images = soup.find_all('img')
        img_srcs = [img.get('src', '') for img in images]
        # Inline <style> blocks (guard against empty tags whose .string is None).
        inline_style_tags = soup.find_all('style')
        inline_style_content = ''.join([tag.string or '' for tag in inline_style_tags])
        features = {
            'num_css_files': len(css_links),
            'num_external_css': sum(1 for link in css_links
                                    if link.get('href', '').startswith('http')),
            'num_external_images': sum(1 for src in img_srcs if src.startswith('http')),
            'num_data_uri_images': sum(1 for src in img_srcs if src.startswith('data:')),
            'num_inline_styles': len(inline_style_tags),
            'inline_css_length': len(inline_style_content),
            'has_favicon': 1 if soup.find('link', {'rel': 'icon'}) or soup.find('link', {'rel': 'shortcut icon'}) else 0,
        }
        return features

    def _extract_advanced_features(self, soup):
        """Extract advanced phishing indicators."""
        features = {}
        # A password field whose form posts off-site is the strongest
        # single credential-theft signal.
        has_password = len(soup.find_all('input', {'type': 'password'})) > 0
        has_external_action = any(
            form.get('action', '').startswith('http')
            for form in soup.find_all('form')
        )
        features['password_with_external_action'] = 1 if (has_password and has_external_action) else 0
        # Obfuscation indicators (checked on the raw serialized markup).
        all_text = str(soup).lower()
        features['has_base64'] = 1 if 'base64' in all_text else 0
        features['has_atob'] = 1 if 'atob(' in all_text else 0
        features['has_fromcharcode'] = 1 if 'fromcharcode' in all_text else 0
        # Inline event-handler attributes.
        features['num_onload_events'] = len(soup.find_all(attrs={'onload': True}))
        features['num_onerror_events'] = len(soup.find_all(attrs={'onerror': True}))
        features['num_onclick_events'] = len(soup.find_all(attrs={'onclick': True}))
        # Count distinct external domains linked from anchors.
        external_domains = set()
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href.startswith('http'):
                # FIX: was a bare `except:`; only malformed URLs are expected
                # here, so catch ValueError instead of swallowing everything.
                try:
                    domain = urlparse(href).netloc
                    if domain:
                        external_domains.add(domain)
                except ValueError:
                    pass
        features['num_unique_external_domains'] = len(external_domains)
        # Forms with inputs but no <label> elements look machine-generated.
        forms = soup.find_all('form')
        features['num_forms_without_labels'] = sum(
            1 for form in forms
            if len(form.find_all('label')) == 0 and len(form.find_all('input')) > 0
        )
        # CSS visibility hiding (phishing technique).
        features['has_display_none'] = 1 if 'display:none' in all_text or 'display: none' in all_text else 0
        features['has_visibility_hidden'] = 1 if 'visibility:hidden' in all_text or 'visibility: hidden' in all_text else 0
        # Popup/redirect indicators.
        features['has_window_open'] = 1 if 'window.open' in all_text else 0
        features['has_location_replace'] = 1 if 'location.replace' in all_text or 'location.href' in all_text else 0
        return features

    def _get_default_features(self):
        """Return dictionary with all features set to 0 (error fallback)."""
        return {name: 0 for name in self._FEATURE_NAMES}

    def get_feature_names(self):
        """Return list of all feature names."""
        return list(self._FEATURE_NAMES)
def extract_features_from_file(html_file_path, url=None):
    """
    Extract features from a single HTML file.

    Args:
        html_file_path: Path to HTML file
        url: Optional URL for context

    Returns:
        Dictionary of features (all zeros if the file cannot be read).
    """
    extractor = HTMLFeatureExtractor()
    try:
        # Undecodable bytes are dropped rather than aborting the read.
        html_content = Path(html_file_path).read_text(encoding='utf-8', errors='ignore')
    except Exception as e:
        print(f"Error reading file {html_file_path}: {e}")
        return extractor._get_default_features()
    return extractor.extract_features(html_content, url)
if __name__ == '__main__':
    # Manual smoke test: pass an HTML file path on the command line.
    import sys

    if len(sys.argv) <= 1:
        # No file given: show usage plus the full feature catalogue.
        print("Usage: python html_features.py <html_file_path>")
        print("\nAvailable features:")
        extractor = HTMLFeatureExtractor()
        for idx, name in enumerate(extractor.get_feature_names(), 1):
            print(f"{idx:2d}. {name}")
        print(f"\nTotal: {len(extractor.get_feature_names())} features")
    else:
        html_file = sys.argv[1]
        extracted = extract_features_from_file(html_file)
        print(f"\nExtracted {len(extracted)} features from {html_file}:")
        print("-" * 80)
        for feature, value in extracted.items():
            print(f"{feature:30s}: {value}")