Spaces: Runtime error
Rasel Santillan committed on
Commit · badc9ad
Parent(s): 4666553
Update

Browse files:
- model/model.py +10 -23
- model/url_feature_extractor.py +830 -240

model/model.py
CHANGED

@@ -10,8 +10,8 @@ from typing import Dict, Any, Optional, Tuple
 import warnings
 from huggingface_hub import hf_hub_download
 
-# Import feature extraction function
-from .url_feature_extractor import extract_features
+# Import feature extraction function
+from .url_feature_extractor import extract_features
 
 warnings.filterwarnings("ignore", message="X does not have valid feature names", category=UserWarning)

@@ -68,22 +68,13 @@ def load_model() -> Dict[str, Any]:
     logger.info(f"Loading model from: {model_path}")
     model_data = joblib.load(model_path)
 
-    # Use feature names from the saved model as the source of truth (Option A)
-    effective_feature_names = model_data["feature_names"]
-    # Validate against extractor's current schema and log if mismatched
-    if list(effective_feature_names) != list(FEATURE_NAMES):
-        logger.warning(
-            "Saved model feature schema differs from extractor FEATURE_NAMES. "
-            "Proceeding with saved model schema as authoritative."
-        )
-
     # Cache the model
     _model_cache = {
         "base_models": model_data["base_models"],
         "meta_scaler": model_data["meta_scaler"],
         "scaler_name": model_data.get("scaler_name", "Unknown"),
         "meta_model": model_data["meta_model"],
-        "feature_names": effective_feature_names,
+        "feature_names": model_data["feature_names"],
         "model_names": model_data["model_names"]
     }

@@ -119,13 +110,12 @@ def predict_from_features(features_dict: Dict[str, Any], model_components: Dict[
     # Convert to DataFrame to ensure shape consistency
     X = pd.DataFrame([features_dict])
 
-    # Ensure all required columns exist
-    missing_cols = ...
+    # Ensure all required columns exist
+    missing_cols = set(feature_names) - set(X.columns)
     if missing_cols:
-        ...
-    # Drop any unexpected columns
+        raise ValueError(f"❌ Missing required features: {missing_cols}")
+
+    # Keep only known features and order them correctly
     X = X[feature_names]
 
     # Level 0: Base model predictions

@@ -303,11 +293,9 @@ def get_meta_features_and_update(url: str, true_label: int) -> Tuple[Optional[np
 
     # Convert to DataFrame and ensure proper ordering
     X = pd.DataFrame([features_dict])
-    missing_cols = ...
+    missing_cols = set(feature_names) - set(X.columns)
     if missing_cols:
-        ...
-        for col in missing_cols:
-            X[col] = 0
+        raise ValueError(f"Missing required features: {missing_cols}")
     X = X[feature_names]
 
     # Generate meta-features using base models (probability outputs)

@@ -357,7 +345,6 @@ def save_updated_model(model_components: Dict[str, Any], updated_meta_model) ->
         "meta_scaler": model_components["meta_scaler"],
         "scaler_name": model_components.get("scaler_name", "Unknown"),
         "meta_model": updated_meta_model,  # Use the updated meta model
-        # Persist the model's authoritative feature schema
         "feature_names": model_components["feature_names"],
         "model_names": model_components["model_names"]
     }
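
Net effect of the model.py change: missing feature columns are no longer silently zero-filled before prediction; they now raise. A minimal sketch of the new validation path (the feature names below are an illustrative subset, not the model's full schema):

import pandas as pd

feature_names = ["has_title", "has_input", "number_of_inputs"]  # illustrative subset
features_dict = {"has_title": 1, "has_input": 0}                # 'number_of_inputs' missing

X = pd.DataFrame([features_dict])
missing_cols = set(feature_names) - set(X.columns)
if missing_cols:
    # Old behavior: X[col] = 0 for each missing column; new behavior: fail fast.
    raise ValueError(f"Missing required features: {missing_cols}")
X = X[feature_names]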
model/url_feature_extractor.py
CHANGED

@@ -1,330 +1,920 @@

Removed:

"""
URL Feature Extraction System for Phishing Detection
Extracts 16 URL-based features from URL strings.
No network requests required - all features extracted from URL string only.
"""

...

# Popular/common TLDs for tld_popularity check
POPULAR_TLDS = {
    'com', 'org', 'net', 'edu', 'gov', 'mil', 'int',
    'co', 'io', 'info', 'biz', 'us', 'uk', 'ca', 'au',
    'de', 'fr', 'jp', 'cn', 'ru', 'br', 'in', 'it', 'es',
    'nl', 'se', 'no', 'fi', 'dk', 'ch', 'at', 'be', 'pl',
    'pt', 'ie', 'nz', 'za', 'mx', 'ar', 'cl', 'kr', 'tw',
    'sg', 'hk', 'my', 'th', 'id', 'ph', 'vn', 'ae', 'sa'
}

# Suspicious file extensions
SUSPICIOUS_EXTENSIONS = {
    '.exe', '.zip', '.scr', '.bat', '.cmd', '.msi', '.dll',
    '.pif', '.com', '.vbs', '.js', '.jar', '.wsf', '.ps1',
    '.rar', '.7z', '.tar', '.gz', '.iso', '.dmg', '.apk'
}

...

    if len(parts) >= 1:
        return parts[-1].lower()
    return ''

...

    elif len(parts) == 1:
        return parts[0]
    return ''

...

    hostname = hostname.split(':')[0]

    parts = hostname.split('.')
    # Subdomains = total parts - TLD - domain name
    # e.g., www.mail.example.com has 4 parts, so 4 - 2 = 2 subdomains
    if len(parts) > 2:
        return len(parts) - 2
    return 0

...

def extract_features(url: str) -> Dict[str, int]:
    """
    Extract 16 URL-based features from a URL string.

    Args:
        url (str): URL to extract features from

    Returns:
        dict: Dictionary containing ...
        - Percentages: 0-100 (for percentage_numeric_chars)
        - Entropy: 0-800 (shannon entropy * 100)
    """
    features = {
        ...
    }

    ...
        processed_url = preprocess_url(url)
    ...
        query = parsed.query or ''
    ...
        features['token_count'] = len(tokens)

        # 7. subdomain_count - Number of subdomains
        features['subdomain_count'] = count_subdomains(hostname)

        # 8. query_param_count - Number of query parameters
        if query:
            query_params = parse_qs(query, keep_blank_values=True)
            features['query_param_count'] = len(query_params)
        else:
            features['query_param_count'] = 0

        # 9. tld_length - Length of TLD
        tld = extract_tld(hostname)
        features['tld_length'] = len(tld)

        # 10. path_length - Length of path portion
        features['path_length'] = len(path)

        # 11. has_hyphen_in_domain - Whether domain contains hyphen
        domain_name = extract_domain_name(hostname)
        features['has_hyphen_in_domain'] = 1 if '-' in domain_name else 0

        # 12. number_of_digits - Total count of digits in URL
        features['number_of_digits'] = sum(1 for c in processed_url if c.isdigit())

        # 13. tld_popularity - Whether TLD is popular/common
        features['tld_popularity'] = 1 if tld.lower() in POPULAR_TLDS else 0

        # 14. suspicious_file_extension - Whether URL ends with suspicious extension
        url_lower = processed_url.lower()
        has_suspicious = 0
        for ext in SUSPICIOUS_EXTENSIONS:
            if url_lower.endswith(ext):
                has_suspicious = 1
                break
        features['suspicious_file_extension'] = has_suspicious

        # 15. domain_name_length - Length of domain name only
        features['domain_name_length'] = len(domain_name)

        # 16. percentage_numeric_chars - Percentage of numeric chars (0-100)
        if len(processed_url) > 0:
            digit_count = sum(1 for c in processed_url if c.isdigit())
            percentage = (digit_count / len(processed_url)) * 100
            features['percentage_numeric_chars'] = int(percentage)
        else:
            ...
    except Exception as e:
        ...

FEATURE_NAMES = [
    'url_length',
    'has_ip_address',
    'dot_count',
    'https_flag',
    'url_entropy',
    'token_count',
    'subdomain_count',
    'query_param_count',
    'tld_length',
    'path_length',
    'has_hyphen_in_domain',
    'number_of_digits',
    'tld_popularity',
    'suspicious_file_extension',
    'domain_name_length',
    'percentage_numeric_chars'
]

Added:

"""
URL Feature Extraction System for Phishing Detection
Extracts 43 specific features from URLs and their corresponding webpages.
"""

import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlparse
import warnings
import time
import logging
import numpy as np
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from functools import wraps
import asyncio
import sys

# Playwright imports (optional - graceful degradation if not installed)
try:
    from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
    PLAYWRIGHT_AVAILABLE = True
except ImportError:
    PLAYWRIGHT_AVAILABLE = False
    PlaywrightTimeoutError = Exception  # Fallback for type hints

warnings.filterwarnings('ignore')

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def _is_running_in_event_loop():
    """
    Check if code is running inside an asyncio event loop.

    Returns:
        bool: True if running in an event loop, False otherwise
    """
    try:
        asyncio.get_running_loop()
        return True
    except RuntimeError:
        return False


# Configuration constants
FEATURE_EXTRACTION_MAX_RETRIES = 3
FEATURE_EXTRACTION_RETRY_DELAY = 0.3  # seconds between retries
PAGE_LOAD_TIMEOUT = 20  # seconds to wait for page load
DYNAMIC_CONTENT_WAIT = 3  # seconds to wait for dynamic content after page load


def retry_feature_extraction(max_retries=FEATURE_EXTRACTION_MAX_RETRIES, delay=FEATURE_EXTRACTION_RETRY_DELAY):
    """
    Decorator to retry feature extraction with increasing backoff.

    Args:
        max_retries (int): Maximum number of retry attempts
        delay (float): Initial delay between retries in seconds

    Returns:
        Decorated function with retry logic
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            last_exception = None
            for attempt in range(max_retries):
                try:
                    result = func(*args, **kwargs)
                    # If we got a valid result (not np.nan), return it
                    if result is not None and not (isinstance(result, float) and np.isnan(result)):
                        return result
                    # If result is np.nan or None, retry
                    if attempt < max_retries - 1:
                        time.sleep(delay * (attempt + 1))  # Linearly increasing backoff
                except Exception as e:
                    last_exception = e
                    if attempt < max_retries - 1:
                        time.sleep(delay * (attempt + 1))
                    continue

            # All retries exhausted, return np.nan
            if last_exception:
                logger.debug(f"Feature extraction failed after {max_retries} attempts: {last_exception}")
            return np.nan
        return wrapper
    return decorator
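

# --- Illustrative usage sketch (not part of the commit): exercising the retry
# --- decorator above; `flaky_parse` is a hypothetical extraction helper.
#
#     @retry_feature_extraction(max_retries=2, delay=0.1)
#     def flaky_parse(value):
#         return np.nan if value < 0 else value
#
#     flaky_parse(5)   # -> 5 on the first attempt
#     flaky_parse(-1)  # -> nan once retries are exhausted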


def create_playwright_browser():
    """
    Create a Playwright browser context for dynamic content extraction.

    Returns:
        tuple: (playwright instance, browser, context, page) or (None, None, None, None) if failed
    """
    if not PLAYWRIGHT_AVAILABLE:
        logger.warning("Playwright is not installed. Install with: pip install playwright && playwright install")
        return None, None, None, None

    try:
        # Start Playwright
        playwright = sync_playwright().start()

        # Launch browser with stealth options
        browser = playwright.chromium.launch(
            headless=True,
            args=[
                '--no-sandbox',
                '--disable-dev-shm-usage',
                '--disable-gpu',
                '--disable-extensions',
                '--disable-blink-features=AutomationControlled',
            ]
        )

        # Create context with stealth settings
        context = browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            locale='en-US',
            timezone_id='America/New_York',
            permissions=[],
            extra_http_headers={
                'Accept-Language': 'en-US,en;q=0.9',
                'DNT': '1',
            },
            ignore_https_errors=True,
        )

        # Add init script to hide webdriver property
        context.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            });

            // Override the navigator.plugins to avoid detection
            Object.defineProperty(navigator, 'plugins', {
                get: () => [1, 2, 3, 4, 5]
            });

            // Override the navigator.languages to avoid detection
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US', 'en']
            });
        """)

        # Create a new page
        page = context.new_page()

        # Set default timeout
        page.set_default_timeout(PAGE_LOAD_TIMEOUT * 1000)  # Convert to milliseconds

        logger.info("✓ Playwright browser created successfully")
        return playwright, browser, context, page

    except Exception as e:
        logger.warning(f"Failed to create Playwright browser: {type(e).__name__}: {str(e)[:200]}")
        logger.info("Playwright will be skipped. Install with: pip install playwright && playwright install")
        return None, None, None, None


def fetch_page_with_playwright(url, page=None):
    """
    Fetch a webpage using Playwright to handle dynamic JavaScript content.

    Args:
        url (str): URL to fetch
        page (playwright.sync_api.Page, optional): Existing page instance

    Returns:
        tuple: (BeautifulSoup object, (playwright, browser, context, page)) or (None, None) if failed
    """
    resources_created = False
    playwright_instance = None
    browser = None
    context = None

    try:
        if page is None:
            playwright_instance, browser, context, page = create_playwright_browser()
            resources_created = True

        if page is None:
            return None, None

        logger.info(f"Fetching URL with Playwright: {url}")

        # Navigate to the URL
        try:
            response = page.goto(url, wait_until='networkidle', timeout=PAGE_LOAD_TIMEOUT * 1000)

            # Check if navigation was successful
            if response and response.status >= 400:
                logger.warning(f"Playwright received HTTP {response.status}")
        except PlaywrightTimeoutError:
            logger.warning("Playwright navigation timeout, continuing anyway...")
        except Exception as nav_error:
            logger.warning(f"Playwright navigation error: {nav_error}")
            # Continue anyway - page might have partially loaded

        # Wait for document ready state
        try:
            page.wait_for_load_state('domcontentloaded', timeout=10000)
            page.wait_for_load_state('load', timeout=10000)
        except PlaywrightTimeoutError:
            logger.debug("Load state timeout, continuing...")

        # Additional wait for dynamic content to load
        time.sleep(DYNAMIC_CONTENT_WAIT)

        # Wait for body element to be present
        try:
            page.wait_for_selector('body', timeout=10000)
        except PlaywrightTimeoutError:
            logger.debug("Body selector timeout, continuing...")

        # Get the fully rendered page source
        page_source = page.content()

        # Parse with BeautifulSoup
        soup = BeautifulSoup(page_source, 'html.parser')

        logger.info("✓ Successfully fetched and rendered page with Playwright")

        # Return soup and resources (let caller handle cleanup)
        if resources_created:
            return soup, (playwright_instance, browser, context, page)
        else:
            return soup, None

    except Exception as e:
        logger.warning(f"Playwright fetch failed: {type(e).__name__}: {str(e)[:100]}")
        if resources_created:
            try:
                if page:
                    page.close()
                if context:
                    context.close()
                if browser:
                    browser.close()
                if playwright_instance:
                    playwright_instance.stop()
            except Exception:
                pass
        return None, None


def fetch_page_with_playwright_safe(url, page=None):
    """
    Thread-safe wrapper for fetch_page_with_playwright that works in both sync and async contexts.

    This function detects if it's running inside an asyncio event loop (e.g., FastAPI/uvicorn)
    and automatically runs the Playwright sync API in a separate thread to avoid conflicts.

    Args:
        url (str): URL to fetch
        page (playwright.sync_api.Page, optional): Existing page instance

    Returns:
        tuple: (BeautifulSoup object, playwright_resources) or (None, None) if failed
    """
    if _is_running_in_event_loop():
        # Running in async context (e.g., FastAPI) - use thread pool
        logger.debug("Detected async context - running Playwright in separate thread")
        try:
            # Run the sync function in a thread pool executor
            # This isolates Playwright's sync API from the asyncio event loop
            import concurrent.futures
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future = executor.submit(fetch_page_with_playwright, url, page)
                result = future.result(timeout=PAGE_LOAD_TIMEOUT + 30)  # Add buffer to timeout
                return result
        except Exception as e:
            logger.warning(f"Failed to run Playwright in thread: {type(e).__name__}: {str(e)[:100]}")
            return None, None
    else:
        # Running in sync context (e.g., direct script execution) - call directly
        logger.debug("Detected sync context - running Playwright directly")
        return fetch_page_with_playwright(url, page)
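

# --- Illustrative usage sketch (not part of the commit): fetching one page via
# --- the sync/async-safe wrapper above; example.com is a stand-in URL.
#
#     soup, resources = fetch_page_with_playwright_safe("http://example.com")
#     if soup is not None:
#         title = soup.find('title')
#         print(title.get_text() if title else "no <title>")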


def get_modern_browser_headers(url=None):
    """
    Generate modern browser headers to mimic a real Chrome browser.

    Args:
        url (str, optional): The target URL for setting referer/origin

    Returns:
        dict: Dictionary of HTTP headers
    """
    headers = {
        # Modern Chrome User-Agent (Chrome 120+)
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',

        # Accept headers
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',

        # Security headers (Sec-Fetch-* headers)
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',

        # Additional browser headers
        'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',

        # Connection settings
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',

        # DNT (Do Not Track)
        'DNT': '1',

        # Cache control
        'Cache-Control': 'max-age=0',
    }

    # Add referer if URL is provided
    if url:
        try:
            parsed = urlparse(url)
            if parsed.scheme and parsed.netloc:
                origin = f"{parsed.scheme}://{parsed.netloc}"
                headers['Origin'] = origin
                headers['Referer'] = origin + '/'
        except Exception:
            pass

    return headers


def create_session_with_retries(max_retries=3):
    """
    Create a requests session with retry logic and connection pooling.

    Args:
        max_retries (int): Maximum number of retries for failed requests

    Returns:
        requests.Session: Configured session object
    """
    session = requests.Session()

    # Configure retry strategy
    retry_strategy = Retry(
        total=max_retries,
        backoff_factor=1,  # Wait 1s, 2s, 4s between retries
        status_forcelist=[429, 500, 502, 503, 504],  # Retry on these HTTP status codes
        allowed_methods=["GET", "HEAD"],  # Only retry safe methods
        raise_on_status=False  # Don't raise exception, let us handle it
    )

    # Mount adapter with retry strategy
    adapter = HTTPAdapter(max_retries=retry_strategy, pool_connections=10, pool_maxsize=10)
    session.mount("http://", adapter)
    session.mount("https://", adapter)

    return session
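

# --- Illustrative usage sketch (not part of the commit): combining the header
# --- and session helpers above; example.com is a stand-in URL.
#
#     session = create_session_with_retries(max_retries=3)
#     response = session.get("http://example.com",
#                            headers=get_modern_browser_headers("http://example.com"),
#                            timeout=15, allow_redirects=True)
#     print(response.status_code)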


def preprocess_url(url):
    """
    Add http:// scheme to URL if missing.

    Args:
        url (str): Original URL

    Returns:
        str: URL with scheme
    """
    url = url.strip()
    if not url.startswith(('http://', 'https://')):
        return f'http://{url}'
    return url
+
def extract_feature_with_retry(soup, feature_name, extraction_func, max_retries=FEATURE_EXTRACTION_MAX_RETRIES):
|
| 387 |
"""
|
| 388 |
+
Extract a single feature with retry logic.
|
| 389 |
+
|
| 390 |
+
All features are returned as integers:
|
| 391 |
+
- 'has_*' features return binary 0 or 1
|
| 392 |
+
- 'number_of_*' and 'length_of_*' features return whole numbers (integers)
|
| 393 |
+
- On failure, returns -1 (instead of np.nan) to maintain integer type consistency
|
| 394 |
|
| 395 |
Args:
|
| 396 |
+
soup (BeautifulSoup): Parsed HTML content
|
| 397 |
+
feature_name (str): Name of the feature being extracted
|
| 398 |
+
extraction_func (callable): Function that performs the extraction
|
| 399 |
+
max_retries (int): Maximum number of retry attempts
|
| 400 |
|
| 401 |
Returns:
|
| 402 |
+
int: Feature value as integer, or -1 if all retries fail
|
| 403 |
"""
|
| 404 |
+
last_exception = None
|
|
|
|
| 405 |
|
| 406 |
+
for attempt in range(max_retries):
|
| 407 |
+
try:
|
| 408 |
+
result = extraction_func(soup)
|
| 409 |
|
| 410 |
+
# If we got a valid result, cast to int and return it
|
| 411 |
+
if result is not None and not (isinstance(result, float) and np.isnan(result)):
|
| 412 |
+
if attempt > 0:
|
| 413 |
+
logger.debug(f"Feature '{feature_name}' extracted successfully on attempt {attempt + 1}")
|
| 414 |
+
# Ensure integer type for all features
|
| 415 |
+
return int(result)
|
| 416 |
|
| 417 |
+
# If result is None or np.nan, retry with a small delay
|
| 418 |
+
if attempt < max_retries - 1:
|
| 419 |
+
time.sleep(FEATURE_EXTRACTION_RETRY_DELAY * (attempt + 1))
|
| 420 |
|
| 421 |
+
except Exception as e:
|
| 422 |
+
last_exception = e
|
| 423 |
+
if attempt < max_retries - 1:
|
| 424 |
+
logger.debug(f"Retry {attempt + 1}/{max_retries} for '{feature_name}': {type(e).__name__}")
|
| 425 |
+
time.sleep(FEATURE_EXTRACTION_RETRY_DELAY * (attempt + 1))
|
| 426 |
+
continue
|
| 427 |
|
| 428 |
+
# All retries exhausted - return -1 to indicate failure while maintaining integer type
|
| 429 |
+
if last_exception:
|
| 430 |
+
logger.debug(f"Error extracting {feature_name} after {max_retries} attempts: {last_exception}")
|
| 431 |
+
|
| 432 |
+
return -1
|
| 433 |
|
|
|
|
|
|
|
|
|
|
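

# --- Illustrative usage sketch (not part of the commit): extracting features
# --- from a small in-memory document with the helper above.
#
#     doc = BeautifulSoup("<html><title>Hi</title><body><p>x</p></body></html>",
#                         "html.parser")
#     extract_feature_with_retry(doc, 'has_title',
#                                lambda s: 1 if s.find('title') else 0)     # -> 1
#     extract_feature_with_retry(doc, 'number_of_paragraph',
#                                lambda s: len(s.find_all('p')))            # -> 1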


def extract_features(url):
    """
    Extract all 43 features from a URL and its webpage.

    Args:
        url (str): URL to extract features from

    Returns:
        dict: Dictionary containing all 43 features as integers.
        - 'has_*' features: 0 (not present), 1 (present), or -1 (extraction failed/unreachable)
        - 'number_of_*' and 'length_of_*' features: >= 0 count/length, or -1 (extraction failed/unreachable)
    """
    # Initialize all features with -1 (for unreachable sites)
    # Using -1 instead of None to maintain integer type consistency
    features = {
        'has_title': -1,
        'has_input': -1,
        'has_button': -1,
        'has_image': -1,
        'has_submit': -1,
        'has_link': -1,
        'has_password': -1,
        'has_email_input': -1,
        'has_hidden_element': -1,
        'has_audio': -1,
        'has_video': -1,
        'number_of_inputs': -1,
        'number_of_buttons': -1,
        'number_of_images': -1,
        'number_of_option': -1,
        'number_of_list': -1,
        'number_of_th': -1,
        'number_of_tr': -1,
        'number_of_href': -1,
        'number_of_paragraph': -1,
        'number_of_script': -1,
        'length_of_title': -1,
        'has_h1': -1,
        'has_h2': -1,
        'has_h3': -1,
        'length_of_text': -1,
        'number_of_clickable_button': -1,
        'number_of_a': -1,
        'number_of_img': -1,
        'number_of_div': -1,
        'number_of_figure': -1,
        'has_footer': -1,
        'has_form': -1,
        'has_text_area': -1,
        'has_iframe': -1,
        'has_text_input': -1,
        'number_of_meta': -1,
        'has_nav': -1,
        'has_object': -1,
        'has_picture': -1,
        'number_of_sources': -1,
        'number_of_span': -1,
        'number_of_table': -1
    }

    # Preprocess URL
    processed_url = preprocess_url(url)

    # Try multiple approaches with increasing robustness
    response = None
    soup = None
    last_error = None

    # Approach 1: Use session with retry logic and modern headers
    try:
        logger.info(f"Attempting to fetch URL with session and retries: {processed_url}")
        session = create_session_with_retries(max_retries=3)
        headers = get_modern_browser_headers(processed_url)

        response = session.get(
            processed_url,
            headers=headers,
            timeout=15,
            allow_redirects=True,
            verify=False
        )

        # Check if we got a successful response
        if response.status_code == 200:
            logger.info(f"✓ Successfully fetched URL (status: {response.status_code})")
            # Decode content with UTF-8 and replace errors to avoid encoding warnings
            html_content = response.content.decode('utf-8', errors='replace')
            soup = BeautifulSoup(html_content, 'html.parser', from_encoding='utf-8')
        else:
            logger.warning(f"Received HTTP {response.status_code} for {processed_url}")
            raise requests.exceptions.HTTPError(f"HTTP {response.status_code}")

    except requests.exceptions.Timeout:
        last_error = "Timeout error: Request took longer than 15 seconds"
        logger.warning(f"✗ {last_error}")
    except requests.exceptions.ConnectionError:
        last_error = f"Connection error: Unable to establish connection to {processed_url}"
        logger.warning(f"✗ {last_error}")
    except requests.exceptions.HTTPError as e:
        last_error = f"HTTP error: {str(e)}"
        logger.warning(f"✗ {last_error}")
    except requests.exceptions.TooManyRedirects:
        last_error = "Too many redirects: URL redirected too many times"
        logger.warning(f"✗ {last_error}")
    except Exception as e:
        last_error = f"Unexpected error in approach 1: {type(e).__name__}: {str(e)[:100]}"
        logger.warning(f"✗ {last_error}")

    # Approach 2: Fallback to simple request with enhanced headers if first approach failed
    if soup is None:
        try:
            logger.info("Trying fallback approach with enhanced headers...")
            time.sleep(2)  # Brief delay before retry

            # More complete headers to mimic a real browser
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                'Accept-Language': 'en-US,en;q=0.9',
                'Accept-Encoding': 'gzip, deflate, br',
                'DNT': '1',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
                'Sec-Fetch-Dest': 'document',
                'Sec-Fetch-Mode': 'navigate',
                'Sec-Fetch-Site': 'none',
                'Sec-Fetch-User': '?1',
                'Cache-Control': 'max-age=0',
            }

            response = requests.get(
                processed_url,
                headers=headers,
                timeout=10,
                allow_redirects=True,
                verify=False
            )

            if response.status_code == 200:
                logger.info(f"✓ Fallback approach succeeded (status: {response.status_code})")
                # Decode content with UTF-8 and replace errors to avoid encoding warnings
                html_content = response.content.decode('utf-8', errors='replace')
                soup = BeautifulSoup(html_content, 'html.parser', from_encoding='utf-8')
            else:
                last_error = f"HTTP {response.status_code}: {response.reason}"
                logger.warning(f"✗ Fallback failed with HTTP {response.status_code}")

        except Exception as e:
            last_error = f"Fallback error: {type(e).__name__}: {str(e)[:100]}"
            logger.warning(f"✗ {last_error}")

    # Approach 3: Use Playwright for dynamic content if previous approaches failed
    playwright_resources = None
    if soup is None:
        try:
            logger.info("Trying Playwright approach for dynamic content...")
            time.sleep(1)  # Brief delay before retry

            soup, playwright_resources = fetch_page_with_playwright_safe(processed_url)

            if soup is not None:
                logger.info("✓ Playwright approach succeeded")
            else:
                last_error = "Playwright fetch failed"
                logger.warning("✗ Playwright approach failed")

        except Exception as e:
            last_error = f"Playwright error: {type(e).__name__}: {str(e)[:100]}"
            logger.warning(f"✗ {last_error}")

    # If all approaches failed, return features with -1 values
    if soup is None:
        error_msg = last_error if last_error else "Unknown error occurred"
        logger.error(f" ✗ Failed to extract features from {processed_url}: {error_msg}")
        print(f" ✗ Failed to extract features: {error_msg}")
        return features

    # Successfully fetched content, now extract features
    # Each extractor returns an int: 0/1 for missing/present elements, -1 on extraction errors
    # Each feature extraction includes retry logic for robustness
+
# 1. has_title
|
| 617 |
+
features['has_title'] = extract_feature_with_retry(
|
| 618 |
+
soup, 'has_title',
|
| 619 |
+
lambda s: 1 if s.find('title') else 0
|
| 620 |
+
)
|
| 621 |
+
|
| 622 |
+
# 2. has_input
|
| 623 |
+
features['has_input'] = extract_feature_with_retry(
|
| 624 |
+
soup, 'has_input',
|
| 625 |
+
lambda s: 1 if s.find('input') else 0
|
| 626 |
+
)
|
| 627 |
+
|
| 628 |
+
# 3. has_button
|
| 629 |
+
features['has_button'] = extract_feature_with_retry(
|
| 630 |
+
soup, 'has_button',
|
| 631 |
+
lambda s: 1 if s.find('button') else 0
|
| 632 |
+
)
|
| 633 |
+
|
| 634 |
+
# 4. has_image
|
| 635 |
+
features['has_image'] = extract_feature_with_retry(
|
| 636 |
+
soup, 'has_image',
|
| 637 |
+
lambda s: 1 if s.find('img') else 0
|
| 638 |
+
)
|
| 639 |
+
|
| 640 |
+
# 5. has_submit
|
| 641 |
+
features['has_submit'] = extract_feature_with_retry(
|
| 642 |
+
soup, 'has_submit',
|
| 643 |
+
lambda s: 1 if s.find('input', {'type': 'submit'}) else 0
|
| 644 |
+
)
|
| 645 |
+
|
| 646 |
+
# 6. has_link
|
| 647 |
+
features['has_link'] = extract_feature_with_retry(
|
| 648 |
+
soup, 'has_link',
|
| 649 |
+
lambda s: 1 if s.find('a') else 0
|
| 650 |
+
)
|
| 651 |
+
|
| 652 |
+
# 7. has_password
|
| 653 |
+
features['has_password'] = extract_feature_with_retry(
|
| 654 |
+
soup, 'has_password',
|
| 655 |
+
lambda s: 1 if s.find('input', {'type': 'password'}) else 0
|
| 656 |
+
)
|
| 657 |
+
|
| 658 |
+
# 8. has_email_input
|
| 659 |
+
features['has_email_input'] = extract_feature_with_retry(
|
| 660 |
+
soup, 'has_email_input',
|
| 661 |
+
lambda s: 1 if s.find('input', {'type': 'email'}) else 0
|
| 662 |
+
)
|
| 663 |
+
|
| 664 |
+
# 9. has_hidden_element
|
| 665 |
+
features['has_hidden_element'] = extract_feature_with_retry(
|
| 666 |
+
soup, 'has_hidden_element',
|
| 667 |
+
lambda s: 1 if s.find('input', {'type': 'hidden'}) else 0
|
| 668 |
+
)
|
| 669 |
+
|
| 670 |
+
# 10. has_audio
|
| 671 |
+
features['has_audio'] = extract_feature_with_retry(
|
| 672 |
+
soup, 'has_audio',
|
| 673 |
+
lambda s: 1 if s.find('audio') else 0
|
| 674 |
+
)
|
| 675 |
+
|
| 676 |
+
# 11. has_video
|
| 677 |
+
features['has_video'] = extract_feature_with_retry(
|
| 678 |
+
soup, 'has_video',
|
| 679 |
+
lambda s: 1 if s.find('video') else 0
|
| 680 |
+
)
|
| 681 |
+
|
| 682 |
+
# 12. number_of_inputs
|
| 683 |
+
features['number_of_inputs'] = extract_feature_with_retry(
|
| 684 |
+
soup, 'number_of_inputs',
|
| 685 |
+
lambda s: len(s.find_all('input'))
|
| 686 |
+
)
|
| 687 |
+
|
| 688 |
+
# 13. number_of_buttons
|
| 689 |
+
features['number_of_buttons'] = extract_feature_with_retry(
|
| 690 |
+
soup, 'number_of_buttons',
|
| 691 |
+
lambda s: len(s.find_all('button'))
|
| 692 |
+
)
|
| 693 |
+
|
| 694 |
+
# 14. number_of_images
|
| 695 |
+
features['number_of_images'] = extract_feature_with_retry(
|
| 696 |
+
soup, 'number_of_images',
|
| 697 |
+
lambda s: len(s.find_all('img'))
|
| 698 |
+
)
|
| 699 |
+
|
| 700 |
+
# 15. number_of_option
|
| 701 |
+
features['number_of_option'] = extract_feature_with_retry(
|
| 702 |
+
soup, 'number_of_option',
|
| 703 |
+
lambda s: len(s.find_all('option'))
|
| 704 |
+
)
|
| 705 |
+
|
| 706 |
+
# 16. number_of_list
|
| 707 |
+
features['number_of_list'] = extract_feature_with_retry(
|
| 708 |
+
soup, 'number_of_list',
|
| 709 |
+
lambda s: len(s.find_all('li'))
|
| 710 |
+
)
|
| 711 |
+
|
| 712 |
+
# 17. number_of_th
|
| 713 |
+
features['number_of_th'] = extract_feature_with_retry(
|
| 714 |
+
soup, 'number_of_th',
|
| 715 |
+
lambda s: len(s.find_all('th'))
|
| 716 |
+
)
|
| 717 |
+
|
| 718 |
+
# 18. number_of_tr
|
| 719 |
+
features['number_of_tr'] = extract_feature_with_retry(
|
| 720 |
+
soup, 'number_of_tr',
|
| 721 |
+
lambda s: len(s.find_all('tr'))
|
| 722 |
+
)
|
| 723 |
+
|
| 724 |
+
# 19. number_of_href
|
| 725 |
+
features['number_of_href'] = extract_feature_with_retry(
|
| 726 |
+
soup, 'number_of_href',
|
| 727 |
+
lambda s: len(s.find_all('a', href=True))
|
| 728 |
+
)
|
| 729 |
+
|
| 730 |
+
# 20. number_of_paragraph
|
| 731 |
+
features['number_of_paragraph'] = extract_feature_with_retry(
|
| 732 |
+
soup, 'number_of_paragraph',
|
| 733 |
+
lambda s: len(s.find_all('p'))
|
| 734 |
+
)
|
| 735 |
+
|
| 736 |
+
# 21. number_of_script
|
| 737 |
+
features['number_of_script'] = extract_feature_with_retry(
|
| 738 |
+
soup, 'number_of_script',
|
| 739 |
+
lambda s: len(s.find_all('script'))
|
| 740 |
+
)
|
| 741 |
+
|
| 742 |
+
# 22. length_of_title
|
| 743 |
+
def extract_title_length(s):
|
| 744 |
+
title_tag = s.find('title')
|
| 745 |
+
return len(title_tag.get_text()) if title_tag else 0
|
| 746 |
+
|
| 747 |
+
features['length_of_title'] = extract_feature_with_retry(
|
| 748 |
+
soup, 'length_of_title',
|
| 749 |
+
extract_title_length
|
| 750 |
+
)
|
| 751 |
+
|
| 752 |
+
# 23. has_h1
|
| 753 |
+
features['has_h1'] = extract_feature_with_retry(
|
| 754 |
+
soup, 'has_h1',
|
| 755 |
+
lambda s: 1 if s.find('h1') else 0
|
| 756 |
+
)
|
| 757 |
+
|
| 758 |
+
# 24. has_h2
|
| 759 |
+
features['has_h2'] = extract_feature_with_retry(
|
| 760 |
+
soup, 'has_h2',
|
| 761 |
+
lambda s: 1 if s.find('h2') else 0
|
| 762 |
+
)
|
| 763 |
+
|
| 764 |
+
# 25. has_h3
|
| 765 |
+
features['has_h3'] = extract_feature_with_retry(
|
| 766 |
+
soup, 'has_h3',
|
| 767 |
+
lambda s: 1 if s.find('h3') else 0
|
| 768 |
+
)
|
| 769 |
+
|
| 770 |
+
# 26. length_of_text
|
| 771 |
+
def extract_text_length(s):
|
| 772 |
+
# Create a copy to avoid modifying the original soup
|
| 773 |
+
soup_copy = BeautifulSoup(str(s), 'html.parser')
|
| 774 |
+
for script_or_style in soup_copy(['script', 'style']):
|
| 775 |
+
script_or_style.decompose()
|
| 776 |
+
body = soup_copy.find('body')
|
| 777 |
+
if body:
|
| 778 |
+
text = body.get_text()
|
| 779 |
+
return len(text)
|
| 780 |
+
return 0
|
| 781 |
|
| 782 |
+
features['length_of_text'] = extract_feature_with_retry(
|
| 783 |
+
soup, 'length_of_text',
|
| 784 |
+
extract_text_length
|
| 785 |
+
)
|
| 786 |
+
|
| 787 |
+
# 27. number_of_clickable_button
|
| 788 |
+
def extract_clickable_buttons(s):
|
| 789 |
+
buttons = len(s.find_all('button'))
|
| 790 |
+
input_buttons = len(s.find_all('input', {'type': ['button', 'submit', 'reset']}))
|
| 791 |
+
return buttons + input_buttons
|
| 792 |
+
|
| 793 |
+
features['number_of_clickable_button'] = extract_feature_with_retry(
|
| 794 |
+
soup, 'number_of_clickable_button',
|
| 795 |
+
extract_clickable_buttons
|
| 796 |
+
)
|
| 797 |
+
|
| 798 |
+
# 28. number_of_a
|
| 799 |
+
features['number_of_a'] = extract_feature_with_retry(
|
| 800 |
+
soup, 'number_of_a',
|
| 801 |
+
lambda s: len(s.find_all('a'))
|
| 802 |
+
)
|
| 803 |
+
|
| 804 |
+
# 29. number_of_img
|
| 805 |
+
features['number_of_img'] = extract_feature_with_retry(
|
| 806 |
+
soup, 'number_of_img',
|
| 807 |
+
lambda s: len(s.find_all('img'))
|
| 808 |
+
)
|
| 809 |
+
|
| 810 |
+
# 30. number_of_div
|
| 811 |
+
features['number_of_div'] = extract_feature_with_retry(
|
| 812 |
+
soup, 'number_of_div',
|
| 813 |
+
lambda s: len(s.find_all('div'))
|
| 814 |
+
)
|
| 815 |
+
|
| 816 |
+
# 31. number_of_figure
|
| 817 |
+
features['number_of_figure'] = extract_feature_with_retry(
|
| 818 |
+
soup, 'number_of_figure',
|
| 819 |
+
lambda s: len(s.find_all('figure'))
|
| 820 |
+
)
|
| 821 |
+
|
| 822 |
+
# 32. has_footer
|
| 823 |
+
features['has_footer'] = extract_feature_with_retry(
|
| 824 |
+
soup, 'has_footer',
|
| 825 |
+
lambda s: 1 if s.find('footer') else 0
|
| 826 |
+
)
|
| 827 |
+
|
| 828 |
+
# 33. has_form
|
| 829 |
+
features['has_form'] = extract_feature_with_retry(
|
| 830 |
+
soup, 'has_form',
|
| 831 |
+
lambda s: 1 if s.find('form') else 0
|
| 832 |
+
)
|
| 833 |
+
|
| 834 |
+
# 34. has_text_area
|
| 835 |
+
features['has_text_area'] = extract_feature_with_retry(
|
| 836 |
+
soup, 'has_text_area',
|
| 837 |
+
lambda s: 1 if s.find('textarea') else 0
|
| 838 |
+
)
|
| 839 |
+
|
| 840 |
+
# 35. has_iframe
|
| 841 |
+
features['has_iframe'] = extract_feature_with_retry(
|
| 842 |
+
soup, 'has_iframe',
|
| 843 |
+
lambda s: 1 if s.find('iframe') else 0
|
| 844 |
+
)
|
| 845 |
+
|
| 846 |
+
# 36. has_text_input
|
| 847 |
+
features['has_text_input'] = extract_feature_with_retry(
|
| 848 |
+
soup, 'has_text_input',
|
| 849 |
+
lambda s: 1 if s.find('input', {'type': 'text'}) else 0
|
| 850 |
+
)
|
| 851 |
+
|
| 852 |
+
# 37. number_of_meta
|
| 853 |
+
features['number_of_meta'] = extract_feature_with_retry(
|
| 854 |
+
soup, 'number_of_meta',
|
| 855 |
+
lambda s: len(s.find_all('meta'))
|
| 856 |
+
)
|
| 857 |
+
|
| 858 |
+
# 38. has_nav
|
| 859 |
+
features['has_nav'] = extract_feature_with_retry(
|
| 860 |
+
soup, 'has_nav',
|
| 861 |
+
lambda s: 1 if s.find('nav') else 0
|
| 862 |
+
)
|
| 863 |
+
|
| 864 |
+
# 39. has_object
|
| 865 |
+
features['has_object'] = extract_feature_with_retry(
|
| 866 |
+
soup, 'has_object',
|
| 867 |
+
lambda s: 1 if s.find('object') else 0
|
| 868 |
+
)
|
| 869 |
+
|
| 870 |
+
# 40. has_picture
|
| 871 |
+
features['has_picture'] = extract_feature_with_retry(
|
| 872 |
+
soup, 'has_picture',
|
| 873 |
+
lambda s: 1 if s.find('picture') else 0
|
| 874 |
+
)
|
| 875 |
+
|
| 876 |
+
# 41. number_of_sources
|
| 877 |
+
features['number_of_sources'] = extract_feature_with_retry(
|
| 878 |
+
soup, 'number_of_sources',
|
| 879 |
+
lambda s: len(s.find_all('source'))
|
| 880 |
+
)
|
| 881 |
+
|
| 882 |
+
# 42. number_of_span
|
| 883 |
+
features['number_of_span'] = extract_feature_with_retry(
|
| 884 |
+
soup, 'number_of_span',
|
| 885 |
+
lambda s: len(s.find_all('span'))
|
| 886 |
+
)
|
| 887 |
+
|
| 888 |
+
# 43. number_of_table
|
| 889 |
+
features['number_of_table'] = extract_feature_with_retry(
|
| 890 |
+
soup, 'number_of_table',
|
| 891 |
+
lambda s: len(s.find_all('table'))
|
| 892 |
+
)
|
| 893 |
+
|
| 894 |
+
# Clean up Playwright resources if they were created
|
| 895 |
+
if playwright_resources is not None:
|
| 896 |
+
try:
|
| 897 |
+
playwright_instance, browser, context, page = playwright_resources
|
| 898 |
+
if page:
|
| 899 |
+
page.close()
|
| 900 |
+
if context:
|
| 901 |
+
context.close()
|
| 902 |
+
if browser:
|
| 903 |
+
browser.close()
|
| 904 |
+
if playwright_instance:
|
| 905 |
+
playwright_instance.stop()
|
| 906 |
+
logger.debug("Playwright resources closed successfully")
|
| 907 |
+
except Exception as e:
|
| 908 |
+
logger.debug(f"Error closing Playwright resources: {e}")
|
| 909 |
+
|
| 910 |
+
# Count successfully extracted features
|
| 911 |
+
# Features with value >= 0 are successfully extracted, -1 indicates failure
|
| 912 |
+
successful_features = sum(1 for v in features.values() if isinstance(v, int) and v >= 0)
|
| 913 |
+
failed_features = sum(1 for v in features.values() if v == -1)
|
| 914 |
+
|
| 915 |
+
if failed_features > 0:
|
| 916 |
+
logger.warning(f"⚠ Extracted {successful_features}/43 features from {processed_url} ({failed_features} failed)")
|
| 917 |
+
else:
|
| 918 |
+
logger.info(f"✓ Successfully extracted all 43 features from {processed_url}")
|
| 919 |
|
| 920 |
+
return features
|