Spaces:

rb1337
/

Phishing-Detection-System

Runtime error

File size: 13,088 Bytes

2cc7f91

"""

Combined URL + HTML Feature Extraction from clean_dataset.csv



Reads URLs from clean_dataset.csv, extracts URL features and downloads HTML

to extract HTML features, combines them into a single feature dataset.

Produces a balanced combined_features.csv.



Usage:

    python scripts/feature_extraction/extract_combined_features.py

    python scripts/feature_extraction/extract_combined_features.py --workers 20 --timeout 15

    python scripts/feature_extraction/extract_combined_features.py --limit 1000 --no-balance

"""
import argparse
import logging
import random
import sys
import time
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from threading import Lock

import numpy as np
import pandas as pd
import requests
import urllib3
from tqdm import tqdm

# Suppress SSL warnings (phishing sites often have invalid certs)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
warnings.filterwarnings('ignore', message='.*Unverified HTTPS.*')

# ---------------------------------------------------------------------------
# Project setup
# ---------------------------------------------------------------------------
PROJECT_ROOT = Path(__file__).resolve().parents[2]  # src/
sys.path.insert(0, str(PROJECT_ROOT))

from scripts.feature_extraction.url.url_features_v3 import URLFeatureExtractorOptimized
from scripts.feature_extraction.html.html_feature_extractor import HTMLFeatureExtractor
from scripts.feature_extraction.html.feature_engineering import engineer_features

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%H:%M:%S',
)
logger = logging.getLogger('extract_combined')

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
}

CHECKPOINT_FILE = PROJECT_ROOT / 'data' / 'features' / '_combined_checkpoint.csv'


# ---------------------------------------------------------------------------
# Feature extraction for a single URL (runs in thread)
# ---------------------------------------------------------------------------
def extract_single(

    url: str,

    label: int,

    url_extractor: URLFeatureExtractorOptimized,

    html_extractor: HTMLFeatureExtractor,

    timeout: int = 10,

) -> dict | None:
    """

    Extract URL + HTML features for a single URL.



    Returns:

        Combined feature dict with url, label, and all features,

        or None on total failure.

    """
    result = {'url': url, 'label': label}

    # --- 1. URL features (always succeeds) ---
    try:
        url_feats = url_extractor.extract_features(url)
        for k, v in url_feats.items():
            result[f'url_{k}'] = v
    except Exception as e:
        logger.debug(f"URL feature error for {url}: {e}")
        return None

    # --- 2. Download HTML & extract HTML features ---
    html_ok = False
    try:
        resp = requests.get(
            url, timeout=timeout, verify=False, headers=HEADERS,
            allow_redirects=True,
        )
        if resp.status_code == 200 and len(resp.text) > 200:
            raw_feats = html_extractor.extract_features(resp.text)
            # Apply feature engineering
            raw_df = pd.DataFrame([raw_feats])
            eng_df = engineer_features(raw_df)
            eng_row = eng_df.iloc[0].to_dict()
            for k, v in eng_row.items():
                result[f'html_{k}'] = v
            html_ok = True
    except Exception:
        pass

    if not html_ok:
        # Fill HTML features with zeros
        dummy_html = html_extractor.extract_features('')
        dummy_df = pd.DataFrame([dummy_html])
        eng_df = engineer_features(dummy_df)
        for k in eng_df.columns:
            result[f'html_{k}'] = 0

    return result


# ---------------------------------------------------------------------------
# Batch extraction with threading + checkpointing
# ---------------------------------------------------------------------------
def extract_all(

    df: pd.DataFrame,

    max_workers: int = 10,

    timeout: int = 10,

    checkpoint_every: int = 500,

) -> pd.DataFrame:
    """

    Extract combined features for all URLs using thread pool.



    Args:

        df: DataFrame with 'url' and 'label' columns.

        max_workers: Parallel download threads.

        timeout: HTTP timeout per URL (seconds).

        checkpoint_every: Save intermediate results every N rows.



    Returns:

        DataFrame with combined features.

    """
    url_extractor = URLFeatureExtractorOptimized()
    html_extractor = HTMLFeatureExtractor()

    urls = df['url'].tolist()
    labels = df['label'].tolist()
    total = len(urls)

    # --- Load checkpoint if exists ---
    done_urls = set()
    results = []
    if CHECKPOINT_FILE.exists():
        ckpt = pd.read_csv(CHECKPOINT_FILE)
        done_urls = set(ckpt['url'].tolist())
        results = ckpt.to_dict('records')
        logger.info(f"Resuming from checkpoint: {len(done_urls):,} URLs already done")

    remaining = [(u, l) for u, l in zip(urls, labels) if u not in done_urls]
    logger.info(f"Remaining URLs to process: {len(remaining):,} / {total:,}")

    if not remaining:
        logger.info("All URLs already processed!")
        return pd.DataFrame(results)

    lock = Lock()
    n_success = 0
    n_html_fail = 0
    n_fail = 0
    t_start = time.perf_counter()

    def _worker(url_label):
        u, l = url_label
        return extract_single(u, l, url_extractor, html_extractor, timeout)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(_worker, item): item for item in remaining}

        with tqdm(total=len(remaining), desc='Extracting', unit='url') as pbar:
            for future in as_completed(futures):
                pbar.update(1)
                result = future.result()

                with lock:
                    if result is not None:
                        results.append(result)
                        n_success += 1

                        # Check if HTML was zero-filled
                        if result.get('html_num_tags', 0) == 0:
                            n_html_fail += 1
                    else:
                        n_fail += 1

                    # Checkpoint
                    if len(results) % checkpoint_every == 0:
                        _save_checkpoint(results)

    elapsed = time.perf_counter() - t_start
    speed = len(remaining) / elapsed if elapsed > 0 else 0

    logger.info(f"\nExtraction complete in {elapsed:.1f}s ({speed:.0f} URLs/sec)")
    logger.info(f"  Successful: {n_success:,}")
    logger.info(f"  HTML download failed (zero-filled): {n_html_fail:,}")
    logger.info(f"  Total failures (skipped): {n_fail:,}")

    # Final checkpoint
    _save_checkpoint(results)

    return pd.DataFrame(results)


def _save_checkpoint(results: list):
    """Save intermediate results to checkpoint file."""
    CHECKPOINT_FILE.parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(results).to_csv(CHECKPOINT_FILE, index=False)


# ---------------------------------------------------------------------------
# Balance dataset
# ---------------------------------------------------------------------------
def balance_dataset(df: pd.DataFrame, random_state: int = 42) -> pd.DataFrame:
    """Undersample majority class to balance the dataset."""
    counts = df['label'].value_counts()
    min_count = counts.min()
    logger.info(f"Balancing: {counts.to_dict()} → {min_count:,} per class")

    balanced = (
        df.groupby('label', group_keys=False)
          .apply(lambda g: g.sample(n=min_count, random_state=random_state))
    )
    balanced = balanced.sample(frac=1, random_state=random_state).reset_index(drop=True)
    return balanced


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    parser = argparse.ArgumentParser(
        description='Extract combined URL + HTML features from clean_dataset.csv')
    parser.add_argument('--input', type=str,
                        default='data/processed/clean_dataset.csv',
                        help='Input CSV with url,label columns')
    parser.add_argument('--output', type=str,
                        default='data/features/combined_features.csv',
                        help='Output CSV path')
    parser.add_argument('--workers', type=int, default=10,
                        help='Parallel download threads (default: 10)')
    parser.add_argument('--timeout', type=int, default=10,
                        help='HTTP timeout in seconds (default: 10)')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit total URLs (for testing)')
    parser.add_argument('--checkpoint-every', type=int, default=500,
                        help='Save checkpoint every N URLs (default: 500)')
    parser.add_argument('--no-balance', action='store_true',
                        help='Do not balance the output dataset')
    args = parser.parse_args()

    input_path = (PROJECT_ROOT / args.input).resolve()
    output_path = (PROJECT_ROOT / args.output).resolve()

    logger.info("=" * 70)
    logger.info("COMBINED URL + HTML FEATURE EXTRACTION")
    logger.info("=" * 70)
    logger.info(f"  Input:    {input_path}")
    logger.info(f"  Output:   {output_path}")
    logger.info(f"  Workers:  {args.workers}")
    logger.info(f"  Timeout:  {args.timeout}s")
    logger.info(f"  Balance:  {'YES' if not args.no_balance else 'NO'}")

    # --- Load dataset ---
    df = pd.read_csv(input_path)
    logger.info(f"\nLoaded {len(df):,} URLs")
    logger.info(f"  Label distribution: {df['label'].value_counts().to_dict()}")

    if args.limit:
        # Stratified limit
        per_class = args.limit // 2
        df = (
            df.groupby('label', group_keys=False)
              .apply(lambda g: g.sample(n=min(per_class, len(g)), random_state=42))
        )
        df = df.reset_index(drop=True)
        logger.info(f"  Limited to: {len(df):,} URLs")

    # --- Extract features ---
    features_df = extract_all(
        df,
        max_workers=args.workers,
        timeout=args.timeout,
        checkpoint_every=args.checkpoint_every,
    )

    if features_df.empty:
        logger.error("No features extracted!")
        sys.exit(1)

    logger.info(f"\nExtracted features: {features_df.shape}")
    logger.info(f"  Label distribution: {features_df['label'].value_counts().to_dict()}")

    # --- Balance ---
    if not args.no_balance:
        features_df = balance_dataset(features_df)
        logger.info(f"  After balancing: {features_df.shape}")
        logger.info(f"  Label dist: {features_df['label'].value_counts().to_dict()}")

    # --- Reorder columns: url, label first, then sorted features ---
    meta_cols = ['url', 'label']
    feature_cols = sorted([c for c in features_df.columns if c not in meta_cols])
    features_df = features_df[meta_cols + feature_cols]

    # --- Clean up infinities / NaNs ---
    features_df = features_df.replace([np.inf, -np.inf], 0)
    features_df = features_df.fillna(0)

    # --- Save ---
    output_path.parent.mkdir(parents=True, exist_ok=True)
    features_df.to_csv(output_path, index=False)

    # --- Cleanup checkpoint ---
    if CHECKPOINT_FILE.exists():
        CHECKPOINT_FILE.unlink()
        logger.info("Checkpoint file cleaned up")

    # --- Summary ---
    logger.info("\n" + "=" * 70)
    logger.info("EXTRACTION COMPLETE")
    logger.info("=" * 70)
    logger.info(f"  Total samples:  {len(features_df):,}")
    logger.info(f"    Legitimate:   {(features_df['label'] == 0).sum():,}")
    logger.info(f"    Phishing:     {(features_df['label'] == 1).sum():,}")
    logger.info(f"  Total features: {len(feature_cols)}")
    url_feats = [c for c in feature_cols if c.startswith('url_')]
    html_feats = [c for c in feature_cols if c.startswith('html_')]
    logger.info(f"    URL features:  {len(url_feats)}")
    logger.info(f"    HTML features: {len(html_feats)}")
    logger.info(f"  Output: {output_path}")
    logger.info("=" * 70)


if __name__ == '__main__':
    main()