Spaces:
Runtime error
Runtime error
| """ | |
| Shared Feature Engineering for HTML-based Phishing Detection | |
| Creates derived features from raw HTML features to improve model performance. | |
| Used by both XGBoost and Random Forest training pipelines. | |
| """ | |
| import numpy as np | |
| import pandas as pd | |
| import logging | |
| logger = logging.getLogger(__name__) | |
def _as_additive(values):
    """Widen boolean data to int so that ``+`` counts flags instead of OR-ing them.

    Adding two bool-dtype Series produces another bool Series (element-wise
    OR), which would cap a sum of 0/1 flags at 1. Scalars (the default used
    for an absent optional column) and non-boolean Series pass through
    unchanged.
    """
    if isinstance(values, pd.Series) and values.dtype == np.bool_:
        return values.astype(int)
    return values


def engineer_features(X: pd.DataFrame) -> pd.DataFrame:
    """
    Create engineered features from raw HTML features.

    Adds ratio features, interaction terms, content-density measures,
    additive risk scores and one boolean aggregate. The input frame is not
    modified; all work happens on a copy.

    Columns read with ``X[...]`` are required. Columns read with
    ``X.get(..., 0)`` are optional and contribute 0 when absent.

    Args:
        X: DataFrame with raw feature columns (no 'label'/'filename')

    Returns:
        Copy of X with the engineered columns added. Every +/-inf and NaN in
        the returned frame (original columns included) is replaced by 0.

    Raises:
        KeyError: if a required raw column is missing from X.
    """
    X = X.copy()

    def flag(name, optional=False):
        # Fetch a 0/1 indicator column in a form that is safe to sum:
        # bool-dtype columns are widened to int (see _as_additive).
        column = X.get(name, 0) if optional else X[name]
        return _as_additive(column)

    # ---- Ratio features (division guarded by +1) ----
    X['forms_to_inputs_ratio'] = X['num_forms'] / (X['num_input_fields'] + 1)
    X['external_to_total_links'] = X['num_external_links'] / (X['num_links'] + 1)
    X['scripts_to_tags_ratio'] = X['num_scripts'] / (X['num_tags'] + 1)
    X['hidden_to_visible_inputs'] = X['num_hidden_fields'] / (X['num_input_fields'] + 1)
    X['password_to_inputs_ratio'] = X['num_password_fields'] / (X['num_input_fields'] + 1)
    X['empty_to_total_links'] = X['num_empty_links'] / (X['num_links'] + 1)
    X['images_to_tags_ratio'] = X['num_images'] / (X['num_tags'] + 1)
    X['iframes_to_tags_ratio'] = X['num_iframes'] / (X['num_tags'] + 1)

    # ---- Interaction features (suspicious combinations) ----
    X['forms_with_passwords'] = X['num_forms'] * X['num_password_fields']
    X['external_scripts_links'] = X['num_external_links'] * X['num_external_scripts']
    X['urgency_with_forms'] = X['num_urgency_keywords'] * X['num_forms']
    X['brand_with_forms'] = X['num_brand_mentions'] * X['num_forms']
    X['iframes_with_scripts'] = X['num_iframes'] * X['num_scripts']
    X['hidden_with_external'] = X['num_hidden_fields'] * X['num_external_form_actions']

    # ---- Content density features ----
    X['content_density'] = (X['text_length'] + 1) / (X['num_divs'] + X['num_spans'] + 1)
    X['form_density'] = X['num_forms'] / (X['num_divs'] + 1)
    X['scripts_per_form'] = X['num_scripts'] / (X['num_forms'] + 1)
    X['links_per_word'] = X['num_links'] / (X['num_words'] + 1)

    # ---- Risk scores ----
    # Every term here is multiplied by an int weight, which already turns a
    # boolean column into integers, so no widening is needed.
    X['phishing_risk_score'] = (
        X['num_urgency_keywords'] * 2 +
        X['num_brand_mentions'] * 2 +
        X['num_password_fields'] * 3 +
        X['num_iframes'] * 2 +
        X.get('num_hidden_iframes', 0) * 4 +
        X.get('num_anchor_text_mismatch', 0) * 3 +
        X.get('num_suspicious_tld_links', 0) * 2 +
        X.get('has_login_form', 0) * 3
    )
    X['form_risk_score'] = (
        X['num_password_fields'] * 3 +
        X['num_external_form_actions'] * 2 +
        X['num_empty_form_actions'] +
        X['num_hidden_fields']
    )
    # Unweighted flag sums: go through flag() so boolean columns are counted
    # rather than OR-ed together.
    X['obfuscation_score'] = (
        flag('has_eval') +
        flag('has_unescape') +
        flag('has_escape') +
        flag('has_document_write') +
        flag('has_base64', optional=True) +
        flag('has_atob', optional=True) +
        flag('has_fromcharcode', optional=True)
    )
    X['legitimacy_score'] = (
        flag('has_title') +
        flag('has_description', optional=True) +
        flag('has_viewport', optional=True) +
        flag('has_favicon', optional=True) +
        flag('has_copyright', optional=True) +
        flag('has_author', optional=True) +
        (X['num_meta_tags'] > 3).astype(int) +
        (X['num_css_files'] > 0).astype(int)
    )

    # ---- Boolean aggregations ----
    # 1 when any single indicator fires; absent optional columns compare
    # as 0 == 1, i.e. False.
    X['has_suspicious_elements'] = (
        (X.get('has_meta_refresh', 0) == 1) |
        (X['num_iframes'] > 0) |
        (X['num_hidden_fields'] > 3) |
        (X.get('has_location_replace', 0) == 1)
    ).astype(int)

    # ---- Clean up ----
    # Applies to the whole frame, not only the engineered columns.
    X = X.replace([np.inf, -np.inf], 0)
    X = X.fillna(0)
    return X
def get_engineered_feature_names() -> list[str]:
    """List the column names that engineer_features() adds.

    Names are grouped by feature family and returned as a new list on every
    call, so callers may mutate the result freely.
    """
    ratio_names = (
        'forms_to_inputs_ratio',
        'external_to_total_links',
        'scripts_to_tags_ratio',
        'hidden_to_visible_inputs',
        'password_to_inputs_ratio',
        'empty_to_total_links',
        'images_to_tags_ratio',
        'iframes_to_tags_ratio',
    )
    interaction_names = (
        'forms_with_passwords',
        'external_scripts_links',
        'urgency_with_forms',
        'brand_with_forms',
        'iframes_with_scripts',
        'hidden_with_external',
    )
    density_names = (
        'content_density',
        'form_density',
        'scripts_per_form',
        'links_per_word',
    )
    score_names = (
        'phishing_risk_score',
        'form_risk_score',
        'obfuscation_score',
        'legitimacy_score',
    )
    boolean_names = ('has_suspicious_elements',)

    # 8 ratios + 6 interactions + 4 densities + 4 scores + 1 boolean = 23 names.
    return [
        *ratio_names,
        *interaction_names,
        *density_names,
        *score_names,
        *boolean_names,
    ]