Spaces:
Runtime error
Runtime error
| """ | |
| Combined URL + HTML Feature Extraction from clean_dataset.csv | |
| Reads URLs from clean_dataset.csv, extracts URL features and downloads HTML | |
| to extract HTML features, combines them into a single feature dataset. | |
| Produces a balanced combined_features.csv. | |
| Usage: | |
| python scripts/feature_extraction/extract_combined_features.py | |
| python scripts/feature_extraction/extract_combined_features.py --workers 20 --timeout 15 | |
| python scripts/feature_extraction/extract_combined_features.py --limit 1000 --no-balance | |
| """ | |
| import argparse | |
| import logging | |
| import random | |
| import sys | |
| import time | |
| import warnings | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| from pathlib import Path | |
| from threading import Lock | |
| import numpy as np | |
| import pandas as pd | |
| import requests | |
| import urllib3 | |
| from tqdm import tqdm | |
# Pages are fetched without certificate verification (phishing sites often
# have invalid certs), so silence the resulting "insecure request" noise
# both at the warnings-filter level and inside urllib3.
warnings.filterwarnings('ignore', message='.*Unverified HTTPS.*')
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
| # --------------------------------------------------------------------------- | |
| # Project setup | |
| # --------------------------------------------------------------------------- | |
# Project root: two directory levels above the directory that contains this
# file (labelled "src/" by the original author). It is placed at the front of
# sys.path so the absolute `scripts.*` imports that follow resolve when this
# file is executed directly as a script.
PROJECT_ROOT = Path(__file__).resolve().parents[2]  # src/
sys.path.insert(0, str(PROJECT_ROOT))
| from scripts.feature_extraction.url.url_features_v3 import URLFeatureExtractorOptimized | |
| from scripts.feature_extraction.html.html_feature_extractor import HTMLFeatureExtractor | |
| from scripts.feature_extraction.html.feature_engineering import engineer_features | |
| # --------------------------------------------------------------------------- | |
| # Logging | |
| # --------------------------------------------------------------------------- | |
# Root logging setup: INFO level, "HH:MM:SS - LEVEL - message" lines.
logging.basicConfig(
    datefmt='%H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Named logger shared by every function in this script.
logger = logging.getLogger('extract_combined')
| # --------------------------------------------------------------------------- | |
| # Constants | |
| # --------------------------------------------------------------------------- | |
# Browser-like request headers sent with every page download.
HEADERS = {
    'User-Agent': ' '.join((
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'AppleWebKit/537.36 (KHTML, like Gecko)',
        'Chrome/120.0.0.0',
        'Safari/537.36',
    )),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
}
# Intermediate results are written here so an interrupted run can resume.
CHECKPOINT_FILE = PROJECT_ROOT.joinpath('data', 'features', '_combined_checkpoint.csv')
| # --------------------------------------------------------------------------- | |
| # Feature extraction for a single URL (runs in thread) | |
| # --------------------------------------------------------------------------- | |
def extract_single(
    url: str,
    label: int,
    url_extractor: URLFeatureExtractorOptimized,
    html_extractor: HTMLFeatureExtractor,
    timeout: int = 10,
) -> dict | None:
    """
    Extract URL + HTML features for a single URL.

    URL features are computed from the URL string itself. The page is then
    downloaded (certificate verification disabled, redirects followed) and
    its HTML features are extracted and passed through ``engineer_features``.
    When the download or the HTML processing fails, or the response is not a
    200 with more than 200 characters of text, every HTML feature is set to 0
    so the row still carries the full set of ``html_*`` columns.

    Args:
        url: URL to process.
        label: Class label, copied unchanged into the output row.
        url_extractor: Object whose ``extract_features(url)`` returns a
            mapping of URL feature names to values.
        html_extractor: Object whose ``extract_features(html)`` returns one
            row of raw HTML features (presumably a mapping; it is wrapped in
            a single-row DataFrame).
        timeout: HTTP timeout in seconds passed to ``requests.get``.

    Returns:
        Combined feature dict with 'url', 'label', 'url_*' and 'html_*' keys,
        or None when URL feature extraction fails.
    """
    result = {'url': url, 'label': label}

    # --- 1. URL features (no network access needed) ---
    try:
        url_feats = url_extractor.extract_features(url)
        result.update((f'url_{k}', v) for k, v in url_feats.items())
    except Exception as e:
        logger.debug("URL feature error for %s: %s", url, e)
        return None

    # --- 2. Download HTML & extract HTML features ---
    html_feats = None
    try:
        resp = requests.get(
            url, timeout=timeout, verify=False, headers=HEADERS,
            allow_redirects=True,
        )
        if resp.status_code == 200 and len(resp.text) > 200:
            raw_df = pd.DataFrame([html_extractor.extract_features(resp.text)])
            # Apply feature engineering and keep the single resulting row.
            html_feats = engineer_features(raw_df).iloc[0].to_dict()
        else:
            logger.debug(
                "Unusable HTML response for %s (status %s)",
                url, resp.status_code,
            )
    except Exception as e:
        # Best-effort by design: dead hosts, TLS errors, timeouts and parser
        # failures are expected for this kind of dataset. Record the reason
        # instead of swallowing it silently, then fall back to zero-filling.
        logger.debug("HTML feature error for %s: %s", url, e)

    if html_feats is None:
        # Derive the HTML column names from an empty document and zero them.
        dummy_df = pd.DataFrame([html_extractor.extract_features('')])
        html_feats = dict.fromkeys(engineer_features(dummy_df).columns, 0)

    for k, v in html_feats.items():
        result[f'html_{k}'] = v
    return result
| # --------------------------------------------------------------------------- | |
| # Batch extraction with threading + checkpointing | |
| # --------------------------------------------------------------------------- | |
def extract_all(
    df: pd.DataFrame,
    max_workers: int = 10,
    timeout: int = 10,
    checkpoint_every: int = 500,
) -> pd.DataFrame:
    """
    Extract combined features for all URLs using a thread pool.

    Rows already present in the checkpoint file are reused and their URLs are
    skipped. New results are appended as workers finish, and the accumulated
    rows are written back to the checkpoint periodically and once at the end.

    Args:
        df: DataFrame with 'url' and 'label' columns.
        max_workers: Parallel download threads.
        timeout: HTTP timeout per URL (seconds).
        checkpoint_every: Save intermediate results every N rows; 0 disables
            the periodic saves (the final save still happens).

    Returns:
        DataFrame with combined features (checkpointed rows included).
    """
    url_extractor = URLFeatureExtractorOptimized()
    html_extractor = HTMLFeatureExtractor()

    urls = df['url'].tolist()
    labels = df['label'].tolist()
    total = len(urls)

    # --- Load checkpoint if exists ---
    done_urls = set()
    results = []
    if CHECKPOINT_FILE.exists():
        try:
            ckpt = pd.read_csv(CHECKPOINT_FILE)
            done_urls = set(ckpt['url'].tolist())
        except (pd.errors.EmptyDataError, pd.errors.ParserError, KeyError) as e:
            # An empty or truncated checkpoint must not block a fresh run.
            logger.warning("Ignoring unreadable checkpoint %s: %s", CHECKPOINT_FILE, e)
        else:
            results = ckpt.to_dict('records')
            logger.info(f"Resuming from checkpoint: {len(done_urls):,} URLs already done")

    remaining = [(u, lbl) for u, lbl in zip(urls, labels) if u not in done_urls]
    logger.info(f"Remaining URLs to process: {len(remaining):,} / {total:,}")
    if not remaining:
        logger.info("All URLs already processed!")
        return pd.DataFrame(results)

    n_success = 0
    n_html_fail = 0
    n_fail = 0
    t_start = time.perf_counter()

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {
            pool.submit(extract_single, u, lbl, url_extractor, html_extractor, timeout): u
            for u, lbl in remaining
        }
        with tqdm(total=len(remaining), desc='Extracting', unit='url') as pbar:
            # Results are consumed in this (the calling) thread only, so the
            # shared counters and list need no lock.
            for future in as_completed(futures):
                pbar.update(1)
                try:
                    result = future.result()
                except Exception as e:
                    # One misbehaving URL must not abort the whole run.
                    logger.warning("Worker failed for %s: %s", futures[future], e)
                    result = None

                if result is None:
                    n_fail += 1
                    continue

                results.append(result)
                n_success += 1
                # Check if HTML was zero-filled
                if result.get('html_num_tags', 0) == 0:
                    n_html_fail += 1

                # Checkpoint only right after a new row was added, so a run
                # of failures never rewrites (or empties) the checkpoint.
                if checkpoint_every and len(results) % checkpoint_every == 0:
                    _save_checkpoint(results)

    elapsed = time.perf_counter() - t_start
    speed = len(remaining) / elapsed if elapsed > 0 else 0
    logger.info(f"\nExtraction complete in {elapsed:.1f}s ({speed:.0f} URLs/sec)")
    logger.info(f" Successful: {n_success:,}")
    logger.info(f" HTML download failed (zero-filled): {n_html_fail:,}")
    logger.info(f" Total failures (skipped): {n_fail:,}")

    # Final checkpoint (skipped when there is nothing to save, so no
    # unreadable empty checkpoint is left behind).
    if results:
        _save_checkpoint(results)
    return pd.DataFrame(results)
def _save_checkpoint(results: list) -> None:
    """
    Save intermediate results to the checkpoint file.

    The rows are written to a sibling temporary file which is then renamed
    over the checkpoint, so an interruption during the write cannot destroy
    the previous checkpoint. Nothing is written when ``results`` is empty,
    because an empty CSV cannot be read back on resume.

    Args:
        results: List of feature dicts accumulated so far.
    """
    if not results:
        return
    CHECKPOINT_FILE.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = CHECKPOINT_FILE.with_name(CHECKPOINT_FILE.name + '.tmp')
    pd.DataFrame(results).to_csv(tmp_path, index=False)
    # Path.replace overwrites the destination in a single rename step.
    tmp_path.replace(CHECKPOINT_FILE)
| # --------------------------------------------------------------------------- | |
| # Balance dataset | |
| # --------------------------------------------------------------------------- | |
def balance_dataset(df: pd.DataFrame, random_state: int = 42) -> pd.DataFrame:
    """
    Undersample every class to the size of the smallest one.

    Each label group is sampled without replacement down to the minimum class
    count, the groups are concatenated, and the result is shuffled.

    Args:
        df: DataFrame with a 'label' column.
        random_state: Seed used for both the per-class sampling and the
            final shuffle.

    Returns:
        Balanced, shuffled DataFrame with a fresh 0..n-1 index. An input with
        no labelled rows is returned unchanged apart from the index reset.
    """
    counts = df['label'].value_counts()
    if counts.empty:
        # Nothing to balance (no rows, or no non-null labels).
        return df.reset_index(drop=True)

    min_count = int(counts.min())
    logger.info(f"Balancing: {counts.to_dict()} → {min_count:,} per class")

    # Sample each group explicitly rather than via groupby(...).apply(...):
    # newer pandas releases stop passing the grouping column to the applied
    # function, which would drop 'label' from the result.
    sampled = [
        group.sample(n=min_count, random_state=random_state)
        for _, group in df.groupby('label')
    ]
    balanced = pd.concat(sampled)
    return balanced.sample(frac=1, random_state=random_state).reset_index(drop=True)
| # --------------------------------------------------------------------------- | |
| # Main | |
| # --------------------------------------------------------------------------- | |
def main():
    """
    Command-line entry point for the combined feature extraction.

    Parses the arguments, loads the input CSV, optionally limits it to a
    stratified sample, extracts the features, optionally balances the
    classes, cleans infinities/NaNs, writes the output CSV and removes the
    checkpoint file. Exits with status 1 when the input is missing or
    malformed, or when no features could be extracted.
    """
    parser = argparse.ArgumentParser(
        description='Extract combined URL + HTML features from clean_dataset.csv')
    parser.add_argument('--input', type=str,
                        default='data/processed/clean_dataset.csv',
                        help='Input CSV with url,label columns')
    parser.add_argument('--output', type=str,
                        default='data/features/combined_features.csv',
                        help='Output CSV path')
    parser.add_argument('--workers', type=int, default=10,
                        help='Parallel download threads (default: 10)')
    parser.add_argument('--timeout', type=int, default=10,
                        help='HTTP timeout in seconds (default: 10)')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit total URLs (for testing)')
    parser.add_argument('--checkpoint-every', type=int, default=500,
                        help='Save checkpoint every N URLs (default: 500)')
    parser.add_argument('--no-balance', action='store_true',
                        help='Do not balance the output dataset')
    args = parser.parse_args()

    # Relative paths are interpreted relative to the project root.
    input_path = (PROJECT_ROOT / args.input).resolve()
    output_path = (PROJECT_ROOT / args.output).resolve()

    logger.info("=" * 70)
    logger.info("COMBINED URL + HTML FEATURE EXTRACTION")
    logger.info("=" * 70)
    logger.info(f" Input: {input_path}")
    logger.info(f" Output: {output_path}")
    logger.info(f" Workers: {args.workers}")
    logger.info(f" Timeout: {args.timeout}s")
    logger.info(f" Balance: {'YES' if not args.no_balance else 'NO'}")

    # --- Load dataset ---
    if not input_path.is_file():
        logger.error(f"Input file not found: {input_path}")
        sys.exit(1)
    df = pd.read_csv(input_path)
    missing_cols = [c for c in ('url', 'label') if c not in df.columns]
    if missing_cols:
        logger.error(f"Input CSV is missing required column(s): {missing_cols}")
        sys.exit(1)
    logger.info(f"\nLoaded {len(df):,} URLs")
    logger.info(f" Label distribution: {df['label'].value_counts().to_dict()}")

    if args.limit:
        # Stratified limit: half of the budget per class, capped at the
        # class size. Groups are sampled explicitly rather than through
        # groupby(...).apply(...), because newer pandas releases stop passing
        # the grouping column to the applied function and 'label' would be
        # dropped.
        per_class = args.limit // 2
        samples = [
            g.sample(n=min(per_class, len(g)), random_state=42)
            for _, g in df.groupby('label')
        ]
        df = pd.concat(samples) if samples else df.iloc[0:0]
        df = df.reset_index(drop=True)
        logger.info(f" Limited to: {len(df):,} URLs")

    # --- Extract features ---
    features_df = extract_all(
        df,
        max_workers=args.workers,
        timeout=args.timeout,
        checkpoint_every=args.checkpoint_every,
    )
    if features_df.empty:
        logger.error("No features extracted!")
        sys.exit(1)
    logger.info(f"\nExtracted features: {features_df.shape}")
    logger.info(f" Label distribution: {features_df['label'].value_counts().to_dict()}")

    # --- Balance ---
    if not args.no_balance:
        features_df = balance_dataset(features_df)
        logger.info(f" After balancing: {features_df.shape}")
        logger.info(f" Label dist: {features_df['label'].value_counts().to_dict()}")

    # --- Reorder columns: url, label first, then sorted features ---
    meta_cols = ['url', 'label']
    feature_cols = sorted(c for c in features_df.columns if c not in meta_cols)
    features_df = features_df[meta_cols + feature_cols]

    # --- Clean up infinities / NaNs ---
    features_df = features_df.replace([np.inf, -np.inf], 0)
    features_df = features_df.fillna(0)

    # --- Save ---
    output_path.parent.mkdir(parents=True, exist_ok=True)
    features_df.to_csv(output_path, index=False)

    # --- Cleanup checkpoint (only reached after the output was written) ---
    if CHECKPOINT_FILE.exists():
        CHECKPOINT_FILE.unlink()
        logger.info("Checkpoint file cleaned up")

    # --- Summary ---
    logger.info("\n" + "=" * 70)
    logger.info("EXTRACTION COMPLETE")
    logger.info("=" * 70)
    logger.info(f" Total samples: {len(features_df):,}")
    logger.info(f" Legitimate: {(features_df['label'] == 0).sum():,}")
    logger.info(f" Phishing: {(features_df['label'] == 1).sum():,}")
    logger.info(f" Total features: {len(feature_cols)}")
    url_feats = [c for c in feature_cols if c.startswith('url_')]
    html_feats = [c for c in feature_cols if c.startswith('html_')]
    logger.info(f" URL features: {len(url_feats)}")
    logger.info(f" HTML features: {len(html_feats)}")
    logger.info(f" Output: {output_path}")
    logger.info("=" * 70)
# Run the extraction pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()