Rasel Santillan committed on
Commit badc9ad · 1 Parent(s): 4666553
Files changed (2)
  1. model/model.py +10 -23
  2. model/url_feature_extractor.py +830 -240
model/model.py CHANGED
@@ -10,8 +10,8 @@ from typing import Dict, Any, Optional, Tuple
 import warnings
 from huggingface_hub import hf_hub_download
 
-# Import feature extraction function and feature names
-from .url_feature_extractor import extract_features, FEATURE_NAMES
+# Import feature extraction function
+from .url_feature_extractor import extract_features
 
 warnings.filterwarnings("ignore", message="X does not have valid feature names", category=UserWarning)
 
@@ -68,22 +68,13 @@ def load_model() -> Dict[str, Any]:
     logger.info(f"Loading model from: {model_path}")
     model_data = joblib.load(model_path)
 
-    # Use feature names from the saved model as the source of truth (Option A)
-    effective_feature_names = model_data["feature_names"]
-    # Validate against extractor's current schema and log if mismatched
-    if list(effective_feature_names) != list(FEATURE_NAMES):
-        logger.warning(
-            "Saved model feature schema differs from extractor FEATURE_NAMES. "
-            "Proceeding with saved model schema as authoritative."
-        )
-
     # Cache the model
     _model_cache = {
         "base_models": model_data["base_models"],
         "meta_scaler": model_data["meta_scaler"],
         "scaler_name": model_data.get("scaler_name", "Unknown"),
         "meta_model": model_data["meta_model"],
-        "feature_names": effective_feature_names,
+        "feature_names": model_data["feature_names"],
         "model_names": model_data["model_names"]
     }
 
@@ -119,13 +110,12 @@ def predict_from_features(features_dict: Dict[str, Any], model_components: Dict[
     # Convert to DataFrame to ensure shape consistency
     X = pd.DataFrame([features_dict])
 
-    # Ensure all required columns exist (auto-fill missing with 0) and order correctly
-    missing_cols = list(set(feature_names) - set(X.columns))
+    # Ensure all required columns exist
+    missing_cols = set(feature_names) - set(X.columns)
     if missing_cols:
-        logger.warning(f" Missing required features in input, filling with 0: {sorted(missing_cols)}")
-        for col in missing_cols:
-            X[col] = 0
-    # Drop any unexpected columns
+        raise ValueError(f" Missing required features: {missing_cols}")
+
+    # Keep only known features and order them correctly
     X = X[feature_names]
 
     # Level 0: Base model predictions
@@ -303,11 +293,9 @@ def get_meta_features_and_update(url: str, true_label: int) -> Tuple[Optional[np
 
     # Convert to DataFrame and ensure proper ordering
     X = pd.DataFrame([features_dict])
-    missing_cols = list(set(feature_names) - set(X.columns))
+    missing_cols = set(feature_names) - set(X.columns)
     if missing_cols:
-        logger.warning(f"Missing required features during update, filling with 0: {sorted(missing_cols)}")
-        for col in missing_cols:
-            X[col] = 0
+        raise ValueError(f"Missing required features: {missing_cols}")
     X = X[feature_names]
 
     # Generate meta-features using base models (probability outputs)
@@ -357,7 +345,6 @@ def save_updated_model(model_components: Dict[str, Any], updated_meta_model) ->
         "meta_scaler": model_components["meta_scaler"],
         "scaler_name": model_components.get("scaler_name", "Unknown"),
         "meta_model": updated_meta_model,  # Use the updated meta model
-        # Persist the model's authoritative feature schema
         "feature_names": model_components["feature_names"],
         "model_names": model_components["model_names"]
     }
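Net effect of the model/model.py change for callers: missing feature columns are no longer silently zero-filled; predict_from_features and get_meta_features_and_update now raise ValueError when the input dict does not cover the saved model's feature_names. A minimal caller sketch of the new behavior (an illustration only, assuming the repository is importable as the package `model`; the example URL is arbitrary):

from model.model import load_model, predict_from_features
from model.url_feature_extractor import extract_features

components = load_model()                           # loads base models, meta model, and feature_names
features = extract_features("https://example.com")  # page-based features as integers
try:
    prediction = predict_from_features(features, components)
except ValueError as err:
    # Raised when required feature columns are missing from the input dict
    print(f"Feature schema mismatch: {err}")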
model/url_feature_extractor.py CHANGED
@@ -1,330 +1,920 @@
1
  """
2
  URL Feature Extraction System for Phishing Detection
3
- Extracts 16 URL-based features for phishing URL classification.
4
- No network requests required - all features extracted from URL string only.
5
  """
6
 
7
- import re
8
- import math
 
9
  import logging
10
- from urllib.parse import urlparse, parse_qs
11
- from typing import Dict
12
 
13
  # Configure logging
14
  logging.basicConfig(level=logging.INFO)
15
  logger = logging.getLogger(__name__)
16
 
17
- # Popular/common TLDs for tld_popularity check
18
- POPULAR_TLDS = {
19
- 'com', 'org', 'net', 'edu', 'gov', 'mil', 'int',
20
- 'co', 'io', 'info', 'biz', 'us', 'uk', 'ca', 'au',
21
- 'de', 'fr', 'jp', 'cn', 'ru', 'br', 'in', 'it', 'es',
22
- 'nl', 'se', 'no', 'fi', 'dk', 'ch', 'at', 'be', 'pl',
23
- 'pt', 'ie', 'nz', 'za', 'mx', 'ar', 'cl', 'kr', 'tw',
24
- 'sg', 'hk', 'my', 'th', 'id', 'ph', 'vn', 'ae', 'sa'
25
- }
26
-
27
- # Suspicious file extensions
28
- SUSPICIOUS_EXTENSIONS = {
29
- '.exe', '.zip', '.scr', '.bat', '.cmd', '.msi', '.dll',
30
- '.pif', '.com', '.vbs', '.js', '.jar', '.wsf', '.ps1',
31
- '.rar', '.7z', '.tar', '.gz', '.iso', '.dmg', '.apk'
32
- }
33
 
 
 
 
34
 
35
- # Regex pattern for detecting IP addresses in URL
36
- IP_ADDRESS_PATTERN = re.compile(
37
- r'^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$'
38
- )
 
 
 
 
39
 
40
- # IPv6 pattern (simplified)
41
- IPV6_PATTERN = re.compile(r'^\[?[0-9a-fA-F:]+\]?$')
 
 
 
42
 
43
 
44
- def preprocess_url(url):
45
  """
46
- Add http:// scheme to URL if missing.
47
 
48
  Args:
49
- url (str): Original URL
 
50
 
51
  Returns:
52
- str: URL with scheme
53
  """
54
- url = url.strip()
55
- if not url.startswith(('http://', 'https://')):
56
- return f'http://{url}'
57
- return url
58
 
59
 
60
- def calculate_shannon_entropy(text: str) -> int:
61
  """
62
- Calculate Shannon entropy of a string.
63
 
64
  Args:
65
- text: Input string
 
66
 
67
  Returns:
68
- int: Shannon entropy value (multiplied by 100 and rounded for integer output)
69
  """
70
- if not text:
71
- return 0
 
 
72
 
73
- # Calculate character frequencies
74
- freq = {}
75
- for char in text:
76
- freq[char] = freq.get(char, 0) + 1
77
 
78
- # Calculate entropy
79
- length = len(text)
80
- entropy = 0.0
81
- for count in freq.values():
82
- probability = count / length
83
- if probability > 0:
84
- entropy -= probability * math.log2(probability)
85
 
86
- # Return entropy * 100 as integer (to preserve precision while keeping int type)
87
- return int(entropy * 100)
88
 
 
 
 
89
 
90
- def extract_tld(hostname: str) -> str:
 
91
  """
92
- Extract TLD from hostname.
93
 
94
  Args:
95
- hostname: Domain hostname (e.g., 'www.example.com')
96
 
97
  Returns:
98
- str: TLD (e.g., 'com')
99
  """
100
- if not hostname:
101
- return ''
102
 
103
- # Remove port if present
104
- hostname = hostname.split(':')[0]
105
 
106
- parts = hostname.split('.')
107
- if len(parts) >= 1:
108
- return parts[-1].lower()
109
- return ''
110
 
111
 
112
- def extract_domain_name(hostname: str) -> str:
113
  """
114
- Extract the main domain name (excluding subdomains and TLD).
115
 
116
  Args:
117
- hostname: Domain hostname (e.g., 'www.mail.example.com')
118
 
119
  Returns:
120
- str: Domain name only (e.g., 'example')
121
  """
122
- if not hostname:
123
- return ''
124
 
125
- # Remove port if present
126
- hostname = hostname.split(':')[0]
 
 
 
 
 
 
127
 
128
- parts = hostname.split('.')
129
- if len(parts) >= 2:
130
- # Return second-to-last part (domain name)
131
- return parts[-2]
132
- elif len(parts) == 1:
133
- return parts[0]
134
- return ''
135
 
 
136
 
137
- def count_subdomains(hostname: str) -> int:
 
138
  """
139
- Count the number of subdomains in hostname.
140
 
141
  Args:
142
- hostname: Domain hostname (e.g., 'www.mail.example.com')
143
 
144
  Returns:
145
- int: Number of subdomains (e.g., 2 for www.mail.example.com)
146
  """
147
- if not hostname:
148
- return 0
149
-
150
- # Remove port if present
151
- hostname = hostname.split(':')[0]
152
-
153
- parts = hostname.split('.')
154
- # Subdomains = total parts - TLD - domain name
155
- # e.g., www.mail.example.com has 4 parts, so 4 - 2 = 2 subdomains
156
- if len(parts) > 2:
157
- return len(parts) - 2
158
- return 0
159
 
160
 
161
- def has_ip_address_in_url(hostname: str) -> int:
162
  """
163
- Check if the hostname is an IP address.
 
 
 
 
 
164
 
165
  Args:
166
- hostname: Domain hostname
 
 
 
167
 
168
  Returns:
169
- int: 1 if IP address, 0 otherwise
170
  """
171
- if not hostname:
172
- return 0
173
 
174
- # Remove port if present
175
- hostname = hostname.split(':')[0]
 
176
 
177
- # Check IPv4
178
- if IP_ADDRESS_PATTERN.match(hostname):
179
- return 1
 
 
 
180
 
181
- # Check IPv6
182
- if IPV6_PATTERN.match(hostname):
183
- return 1
184
 
185
- return 0
 
 
 
 
 
186
 
 
 
 
 
 
187
 
188
- def extract_features(url: str) -> Dict[str, int]:
189
- """
190
- Extract 16 URL-based features from a URL string.
191
 
192
- No network requests are made - all features are extracted from the URL string only.
 
 
193
 
194
  Args:
195
  url (str): URL to extract features from
196
 
197
  Returns:
198
- dict: Dictionary containing 16 features as integers.
199
- - Binary flags: 0 or 1
200
- - Counts and lengths: >= 0
201
- - Percentages: 0-100 (for percentage_numeric_chars)
202
- - Entropy: 0-800 (shannon entropy * 100)
203
  """
204
- # Initialize features with default values
 
205
  features = {
206
- 'url_length': 0,
207
- 'has_ip_address': 0,
208
- 'dot_count': 0,
209
- 'https_flag': 0,
210
- 'url_entropy': 0,
211
- 'token_count': 0,
212
- 'subdomain_count': 0,
213
- 'query_param_count': 0,
214
- 'tld_length': 0,
215
- 'path_length': 0,
216
- 'has_hyphen_in_domain': 0,
217
- 'number_of_digits': 0,
218
- 'tld_popularity': 0,
219
- 'suspicious_file_extension': 0,
220
- 'domain_name_length': 0,
221
- 'percentage_numeric_chars': 0
 
 
 
222
  }
223
 
224
- try:
225
- # Preprocess URL to ensure it has a scheme
226
- processed_url = preprocess_url(url)
227
 
228
- # Parse URL
229
- parsed = urlparse(processed_url)
230
- hostname = parsed.netloc or ''
231
- path = parsed.path or ''
232
- query = parsed.query or ''
233
 
234
- # 1. url_length - Total number of characters in the URL
235
- features['url_length'] = len(processed_url)
236
-
237
- # 2. has_ip_address - Whether URL contains IP address instead of domain
238
- features['has_ip_address'] = has_ip_address_in_url(hostname)
239
-
240
- # 3. dot_count - Number of dots in URL
241
- features['dot_count'] = processed_url.count('.')
242
-
243
- # 4. https_flag - Whether URL uses HTTPS
244
- features['https_flag'] = 1 if parsed.scheme.lower() == 'https' else 0
245
-
246
- # 5. url_entropy - Shannon entropy of URL string (x100 for int)
247
- features['url_entropy'] = calculate_shannon_entropy(processed_url)
248
-
249
- # 6. token_count - Number of tokens separated by delimiters
250
- # Delimiters: /, -, _, ., ?, &, =
251
- tokens = re.split(r'[/\-_\.?&=]+', processed_url)
252
- # Filter out empty tokens and scheme
253
- tokens = [t for t in tokens if t and t not in ('http', 'https', '')]
254
- features['token_count'] = len(tokens)
255
-
256
- # 7. subdomain_count - Number of subdomains
257
- features['subdomain_count'] = count_subdomains(hostname)
258
-
259
- # 8. query_param_count - Number of query parameters
260
- if query:
261
- query_params = parse_qs(query, keep_blank_values=True)
262
- features['query_param_count'] = len(query_params)
263
- else:
264
- features['query_param_count'] = 0
265
-
266
- # 9. tld_length - Length of TLD
267
- tld = extract_tld(hostname)
268
- features['tld_length'] = len(tld)
269
-
270
- # 10. path_length - Length of path portion
271
- features['path_length'] = len(path)
272
-
273
- # 11. has_hyphen_in_domain - Whether domain contains hyphen
274
- domain_name = extract_domain_name(hostname)
275
- features['has_hyphen_in_domain'] = 1 if '-' in domain_name else 0
276
-
277
- # 12. number_of_digits - Total count of digits in URL
278
- features['number_of_digits'] = sum(1 for c in processed_url if c.isdigit())
279
-
280
- # 13. tld_popularity - Whether TLD is popular/common
281
- features['tld_popularity'] = 1 if tld.lower() in POPULAR_TLDS else 0
282
-
283
- # 14. suspicious_file_extension - Whether URL ends with suspicious extension
284
- url_lower = processed_url.lower()
285
- has_suspicious = 0
286
- for ext in SUSPICIOUS_EXTENSIONS:
287
- if url_lower.endswith(ext):
288
- has_suspicious = 1
289
- break
290
- features['suspicious_file_extension'] = has_suspicious
291
-
292
- # 15. domain_name_length - Length of domain name only
293
- features['domain_name_length'] = len(domain_name)
294
-
295
- # 16. percentage_numeric_chars - Percentage of numeric chars (0-100)
296
- if len(processed_url) > 0:
297
- digit_count = sum(1 for c in processed_url if c.isdigit())
298
- percentage = (digit_count / len(processed_url)) * 100
299
- features['percentage_numeric_chars'] = int(percentage)
300
  else:
301
- features['percentage_numeric_chars'] = 0
302
-
303
- logger.info(f"✓ Successfully extracted 16 URL features from: {url}")
304
-
 
 
305
  except Exception as e:
306
- logger.error(f" Error extracting features from {url}: {type(e).__name__}: {str(e)}")
307
- # Return default values on error
308
-
309
- return features
311
 
312
- # List of feature names for external reference
313
- FEATURE_NAMES = [
314
- 'url_length',
315
- 'has_ip_address',
316
- 'dot_count',
317
- 'https_flag',
318
- 'url_entropy',
319
- 'token_count',
320
- 'subdomain_count',
321
- 'query_param_count',
322
- 'tld_length',
323
- 'path_length',
324
- 'has_hyphen_in_domain',
325
- 'number_of_digits',
326
- 'tld_popularity',
327
- 'suspicious_file_extension',
328
- 'domain_name_length',
329
- 'percentage_numeric_chars'
330
- ]
 
1
  """
2
  URL Feature Extraction System for Phishing Detection
3
+ Extracts 43 specific features from URLs and their corresponding webpages.
 
4
  """
5
 
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+ import pandas as pd
9
+ from urllib.parse import urlparse
10
+ import warnings
11
+ import time
12
  import logging
13
+ import numpy as np
14
+ from requests.adapters import HTTPAdapter
15
+ from urllib3.util.retry import Retry
16
+ from functools import wraps
17
+ import asyncio
18
+ import sys
19
+
20
+ # Playwright imports (optional - graceful degradation if not installed)
21
+ try:
22
+ from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
23
+ PLAYWRIGHT_AVAILABLE = True
24
+ except ImportError:
25
+ PLAYWRIGHT_AVAILABLE = False
26
+ PlaywrightTimeoutError = Exception # Fallback for type hints
27
+
28
+ warnings.filterwarnings('ignore')
29
 
30
  # Configure logging
31
  logging.basicConfig(level=logging.INFO)
32
  logger = logging.getLogger(__name__)
33
34
 
35
+ def _is_running_in_event_loop():
36
+ """
37
+ Check if code is running inside an asyncio event loop.
38
 
39
+ Returns:
40
+ bool: True if running in an event loop, False otherwise
41
+ """
42
+ try:
43
+ asyncio.get_running_loop()
44
+ return True
45
+ except RuntimeError:
46
+ return False
47
 
48
+ # Configuration constants
49
+ FEATURE_EXTRACTION_MAX_RETRIES = 3
50
+ FEATURE_EXTRACTION_RETRY_DELAY = 0.3 # seconds between retries
51
+ PAGE_LOAD_TIMEOUT = 20 # seconds to wait for page load
52
+ DYNAMIC_CONTENT_WAIT = 3 # seconds to wait for dynamic content after page load
53
 
54
 
55
+ def retry_feature_extraction(max_retries=FEATURE_EXTRACTION_MAX_RETRIES, delay=FEATURE_EXTRACTION_RETRY_DELAY):
56
  """
57
+ Decorator to retry feature extraction with exponential backoff.
58
 
59
  Args:
60
+ max_retries (int): Maximum number of retry attempts
61
+ delay (float): Initial delay between retries in seconds
62
 
63
  Returns:
64
+ Decorated function with retry logic
65
  """
66
+ def decorator(func):
67
+ @wraps(func)
68
+ def wrapper(*args, **kwargs):
69
+ last_exception = None
70
+ for attempt in range(max_retries):
71
+ try:
72
+ result = func(*args, **kwargs)
73
+ # If we got a valid result (not np.nan), return it
74
+ if result is not None and not (isinstance(result, float) and np.isnan(result)):
75
+ return result
76
+ # If result is np.nan or None, retry
77
+ if attempt < max_retries - 1:
78
+ time.sleep(delay * (attempt + 1)) # Exponential backoff
79
+ except Exception as e:
80
+ last_exception = e
81
+ if attempt < max_retries - 1:
82
+ time.sleep(delay * (attempt + 1))
83
+ continue
84
+
85
+ # All retries exhausted, return np.nan
86
+ if last_exception:
87
+ logger.debug(f"Feature extraction failed after {max_retries} attempts: {last_exception}")
88
+ return np.nan
89
+ return wrapper
90
+ return decorator
91
+
92
+
93
+ def create_playwright_browser():
94
+ """
95
+ Create a Playwright browser context for dynamic content extraction.
96
+
97
+ Returns:
98
+ tuple: (playwright instance, browser, context, page) or (None, None, None, None) if failed
99
+ """
100
+ if not PLAYWRIGHT_AVAILABLE:
101
+ logger.warning("Playwright is not installed. Install with: pip install playwright && playwright install")
102
+ return None, None, None, None
103
+
104
+ try:
105
+ # Start Playwright
106
+ playwright = sync_playwright().start()
107
+
108
+ # Launch browser with stealth options
109
+ browser = playwright.chromium.launch(
110
+ headless=True,
111
+ args=[
112
+ '--no-sandbox',
113
+ '--disable-dev-shm-usage',
114
+ '--disable-gpu',
115
+ '--disable-extensions',
116
+ '--disable-blink-features=AutomationControlled',
117
+ ]
118
+ )
119
+
120
+ # Create context with stealth settings
121
+ context = browser.new_context(
122
+ viewport={'width': 1920, 'height': 1080},
123
+ user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
124
+ locale='en-US',
125
+ timezone_id='America/New_York',
126
+ permissions=[],
127
+ extra_http_headers={
128
+ 'Accept-Language': 'en-US,en;q=0.9',
129
+ 'DNT': '1',
130
+ },
131
+ ignore_https_errors=True,
132
+ )
133
+
134
+ # Add init script to hide webdriver property
135
+ context.add_init_script("""
136
+ Object.defineProperty(navigator, 'webdriver', {
137
+ get: () => undefined
138
+ });
139
+
140
+ // Override the navigator.plugins to avoid detection
141
+ Object.defineProperty(navigator, 'plugins', {
142
+ get: () => [1, 2, 3, 4, 5]
143
+ });
144
+
145
+ // Override the navigator.languages to avoid detection
146
+ Object.defineProperty(navigator, 'languages', {
147
+ get: () => ['en-US', 'en']
148
+ });
149
+ """)
150
+
151
+ # Create a new page
152
+ page = context.new_page()
153
+
154
+ # Set default timeout
155
+ page.set_default_timeout(PAGE_LOAD_TIMEOUT * 1000) # Convert to milliseconds
156
+
157
+ logger.info("✓ Playwright browser created successfully")
158
+ return playwright, browser, context, page
159
+
160
+ except Exception as e:
161
+ logger.warning(f"Failed to create Playwright browser: {type(e).__name__}: {str(e)[:200]}")
162
+ logger.info("Playwright will be skipped. Install with: pip install playwright && playwright install")
163
+ return None, None, None, None
164
 
165
 
166
+ def fetch_page_with_playwright(url, page=None):
167
  """
168
+ Fetch a webpage using Playwright to handle dynamic JavaScript content.
169
 
170
  Args:
171
+ url (str): URL to fetch
172
+ page (playwright.sync_api.Page, optional): Existing page instance
173
 
174
  Returns:
175
+ tuple: (BeautifulSoup object, (playwright, browser, context, page)) or (None, None) if failed
176
  """
177
+ resources_created = False
178
+ playwright_instance = None
179
+ browser = None
180
+ context = None
181
 
182
+ try:
183
+ if page is None:
184
+ playwright_instance, browser, context, page = create_playwright_browser()
185
+ resources_created = True
186
+
187
+ if page is None:
188
+ return None, None
189
+
190
+ logger.info(f"Fetching URL with Playwright: {url}")
191
+
192
+ # Navigate to the URL
193
+ try:
194
+ response = page.goto(url, wait_until='networkidle', timeout=PAGE_LOAD_TIMEOUT * 1000)
195
+
196
+ # Check if navigation was successful
197
+ if response and response.status >= 400:
198
+ logger.warning(f"Playwright received HTTP {response.status}")
199
+ except PlaywrightTimeoutError:
200
+ logger.warning("Playwright navigation timeout, continuing anyway...")
201
+ except Exception as nav_error:
202
+ logger.warning(f"Playwright navigation error: {nav_error}")
203
+ # Continue anyway - page might have partially loaded
204
+
205
+ # Wait for document ready state
206
+ try:
207
+ page.wait_for_load_state('domcontentloaded', timeout=10000)
208
+ page.wait_for_load_state('load', timeout=10000)
209
+ except PlaywrightTimeoutError:
210
+ logger.debug("Load state timeout, continuing...")
211
+
212
+ # Additional wait for dynamic content to load
213
+ time.sleep(DYNAMIC_CONTENT_WAIT)
214
+
215
+ # Wait for body element to be present
216
+ try:
217
+ page.wait_for_selector('body', timeout=10000)
218
+ except PlaywrightTimeoutError:
219
+ logger.debug("Body selector timeout, continuing...")
220
+
221
+ # Get the fully rendered page source
222
+ page_source = page.content()
223
+
224
+ # Parse with BeautifulSoup
225
+ soup = BeautifulSoup(page_source, 'html.parser')
226
+
227
+ logger.info(f"✓ Successfully fetched and rendered page with Playwright")
228
+
229
+ # Return soup and resources (let caller handle cleanup)
230
+ if resources_created:
231
+ return soup, (playwright_instance, browser, context, page)
232
+ else:
233
+ return soup, None
234
 
235
+ except Exception as e:
236
+ logger.warning(f"Playwright fetch failed: {type(e).__name__}: {str(e)[:100]}")
237
+ if resources_created:
238
+ try:
239
+ if page:
240
+ page.close()
241
+ if context:
242
+ context.close()
243
+ if browser:
244
+ browser.close()
245
+ if playwright_instance:
246
+ playwright_instance.stop()
247
+ except:
248
+ pass
249
+ return None, None
250
+
251
+
252
+ def fetch_page_with_playwright_safe(url, page=None):
253
+ """
254
+ Thread-safe wrapper for fetch_page_with_playwright that works in both sync and async contexts.
255
 
256
+ This function detects if it's running inside an asyncio event loop (e.g., FastAPI/uvicorn)
257
+ and automatically runs the Playwright sync API in a separate thread to avoid conflicts.
258
 
259
+ Args:
260
+ url (str): URL to fetch
261
+ page (playwright.sync_api.Page, optional): Existing page instance
262
 
263
+ Returns:
264
+ tuple: (BeautifulSoup object, playwright_resources) or (None, None) if failed
265
  """
266
+ if _is_running_in_event_loop():
267
+ # Running in async context (e.g., FastAPI) - use thread pool
268
+ logger.debug("Detected async context - running Playwright in separate thread")
269
+ try:
270
+ # Run the sync function in a thread pool executor
271
+ # This isolates Playwright's sync API from the asyncio event loop
272
+ import concurrent.futures
273
+ with concurrent.futures.ThreadPoolExecutor() as executor:
274
+ future = executor.submit(fetch_page_with_playwright, url, page)
275
+ result = future.result(timeout=PAGE_LOAD_TIMEOUT + 30) # Add buffer to timeout
276
+ return result
277
+ except Exception as e:
278
+ logger.warning(f"Failed to run Playwright in thread: {type(e).__name__}: {str(e)[:100]}")
279
+ return None, None
280
+ else:
281
+ # Running in sync context (e.g., direct script execution) - call directly
282
+ logger.debug("Detected sync context - running Playwright directly")
283
+ return fetch_page_with_playwright(url, page)
284
+
285
+
286
+ def get_modern_browser_headers(url=None):
287
+ """
288
+ Generate modern browser headers to mimic a real Chrome browser.
289
 
290
  Args:
291
+ url (str, optional): The target URL for setting referer/origin
292
 
293
  Returns:
294
+ dict: Dictionary of HTTP headers
295
  """
296
+ headers = {
297
+ # Modern Chrome User-Agent (Chrome 120+)
298
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
299
+
300
+ # Accept headers
301
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
302
+ 'Accept-Language': 'en-US,en;q=0.9',
303
+ 'Accept-Encoding': 'gzip, deflate, br',
304
+
305
+ # Security headers (Sec-Fetch-* headers)
306
+ 'Sec-Fetch-Dest': 'document',
307
+ 'Sec-Fetch-Mode': 'navigate',
308
+ 'Sec-Fetch-Site': 'none',
309
+ 'Sec-Fetch-User': '?1',
310
+
311
+ # Additional browser headers
312
+ 'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
313
+ 'Sec-Ch-Ua-Mobile': '?0',
314
+ 'Sec-Ch-Ua-Platform': '"Windows"',
315
+
316
+ # Connection settings
317
+ 'Connection': 'keep-alive',
318
+ 'Upgrade-Insecure-Requests': '1',
319
+
320
+ # DNT (Do Not Track)
321
+ 'DNT': '1',
322
+
323
+ # Cache control
324
+ 'Cache-Control': 'max-age=0',
325
+ }
326
 
327
+ # Add referer if URL is provided
328
+ if url:
329
+ try:
330
+ parsed = urlparse(url)
331
+ if parsed.scheme and parsed.netloc:
332
+ origin = f"{parsed.scheme}://{parsed.netloc}"
333
+ headers['Origin'] = origin
334
+ headers['Referer'] = origin + '/'
335
+ except Exception:
336
+ pass
337
 
338
+ return headers
 
 
 
339
 
340
 
341
+ def create_session_with_retries(max_retries=3):
342
  """
343
+ Create a requests session with retry logic and connection pooling.
344
 
345
  Args:
346
+ max_retries (int): Maximum number of retries for failed requests
347
 
348
  Returns:
349
+ requests.Session: Configured session object
350
  """
351
+ session = requests.Session()
 
352
 
353
+ # Configure retry strategy
354
+ retry_strategy = Retry(
355
+ total=max_retries,
356
+ backoff_factor=1, # Wait 1s, 2s, 4s between retries
357
+ status_forcelist=[429, 500, 502, 503, 504], # Retry on these HTTP status codes
358
+ allowed_methods=["GET", "HEAD"], # Only retry safe methods
359
+ raise_on_status=False # Don't raise exception, let us handle it
360
+ )
361
 
362
+ # Mount adapter with retry strategy
363
+ adapter = HTTPAdapter(max_retries=retry_strategy, pool_connections=10, pool_maxsize=10)
364
+ session.mount("http://", adapter)
365
+ session.mount("https://", adapter)
 
 
 
366
 
367
+ return session
368
 
369
+
370
+ def preprocess_url(url):
371
  """
372
+ Add http:// scheme to URL if missing.
373
 
374
  Args:
375
+ url (str): Original URL
376
 
377
  Returns:
378
+ str: URL with scheme
379
  """
380
+ url = url.strip()
381
+ if not url.startswith(('http://', 'https://')):
382
+ return f'http://{url}'
383
+ return url
384
 
385
 
386
+ def extract_feature_with_retry(soup, feature_name, extraction_func, max_retries=FEATURE_EXTRACTION_MAX_RETRIES):
387
  """
388
+ Extract a single feature with retry logic.
389
+
390
+ All features are returned as integers:
391
+ - 'has_*' features return binary 0 or 1
392
+ - 'number_of_*' and 'length_of_*' features return whole numbers (integers)
393
+ - On failure, returns -1 (instead of np.nan) to maintain integer type consistency
394
 
395
  Args:
396
+ soup (BeautifulSoup): Parsed HTML content
397
+ feature_name (str): Name of the feature being extracted
398
+ extraction_func (callable): Function that performs the extraction
399
+ max_retries (int): Maximum number of retry attempts
400
 
401
  Returns:
402
+ int: Feature value as integer, or -1 if all retries fail
403
  """
404
+ last_exception = None
 
405
 
406
+ for attempt in range(max_retries):
407
+ try:
408
+ result = extraction_func(soup)
409
 
410
+ # If we got a valid result, cast to int and return it
411
+ if result is not None and not (isinstance(result, float) and np.isnan(result)):
412
+ if attempt > 0:
413
+ logger.debug(f"Feature '{feature_name}' extracted successfully on attempt {attempt + 1}")
414
+ # Ensure integer type for all features
415
+ return int(result)
416
 
417
+ # If result is None or np.nan, retry with a small delay
418
+ if attempt < max_retries - 1:
419
+ time.sleep(FEATURE_EXTRACTION_RETRY_DELAY * (attempt + 1))
420
 
421
+ except Exception as e:
422
+ last_exception = e
423
+ if attempt < max_retries - 1:
424
+ logger.debug(f"Retry {attempt + 1}/{max_retries} for '{feature_name}': {type(e).__name__}")
425
+ time.sleep(FEATURE_EXTRACTION_RETRY_DELAY * (attempt + 1))
426
+ continue
427
 
428
+ # All retries exhausted - return -1 to indicate failure while maintaining integer type
429
+ if last_exception:
430
+ logger.debug(f"Error extracting {feature_name} after {max_retries} attempts: {last_exception}")
431
+
432
+ return -1
433
 
 
 
 
434
 
435
+ def extract_features(url):
436
+ """
437
+ Extract all 43 features from a URL and its webpage.
438
 
439
  Args:
440
  url (str): URL to extract features from
441
 
442
  Returns:
443
+ dict: Dictionary containing all 43 features as integers.
444
+ - 'has_*' features: 0 (not present), 1 (present), or -1 (extraction failed/unreachable)
445
+ - 'number_of_*' and 'length_of_*' features: >= 0 count/length, or -1 (extraction failed/unreachable)
 
 
446
  """
447
+ # Initialize all features with -1 (for unreachable sites)
448
+ # Using -1 instead of None to maintain integer type consistency
449
  features = {
450
+ 'has_title': -1,
451
+ 'has_input': -1,
452
+ 'has_button': -1,
453
+ 'has_image': -1,
454
+ 'has_submit': -1,
455
+ 'has_link': -1,
456
+ 'has_password': -1,
457
+ 'has_email_input': -1,
458
+ 'has_hidden_element': -1,
459
+ 'has_audio': -1,
460
+ 'has_video': -1,
461
+ 'number_of_inputs': -1,
462
+ 'number_of_buttons': -1,
463
+ 'number_of_images': -1,
464
+ 'number_of_option': -1,
465
+ 'number_of_list': -1,
466
+ 'number_of_th': -1,
467
+ 'number_of_tr': -1,
468
+ 'number_of_href': -1,
469
+ 'number_of_paragraph': -1,
470
+ 'number_of_script': -1,
471
+ 'length_of_title': -1,
472
+ 'has_h1': -1,
473
+ 'has_h2': -1,
474
+ 'has_h3': -1,
475
+ 'length_of_text': -1,
476
+ 'number_of_clickable_button': -1,
477
+ 'number_of_a': -1,
478
+ 'number_of_img': -1,
479
+ 'number_of_div': -1,
480
+ 'number_of_figure': -1,
481
+ 'has_footer': -1,
482
+ 'has_form': -1,
483
+ 'has_text_area': -1,
484
+ 'has_iframe': -1,
485
+ 'has_text_input': -1,
486
+ 'number_of_meta': -1,
487
+ 'has_nav': -1,
488
+ 'has_object': -1,
489
+ 'has_picture': -1,
490
+ 'number_of_sources': -1,
491
+ 'number_of_span': -1,
492
+ 'number_of_table': -1
493
  }
494
 
495
+ # Preprocess URL
496
+ processed_url = preprocess_url(url)
 
497
 
498
+ # Try multiple approaches with increasing robustness
499
+ response = None
500
+ soup = None
501
+ last_error = None
 
502
 
503
+ # Approach 1: Use session with retry logic and modern headers
504
+ try:
505
+ logger.info(f"Attempting to fetch URL with session and retries: {processed_url}")
506
+ session = create_session_with_retries(max_retries=3)
507
+ headers = get_modern_browser_headers(processed_url)
508
+
509
+ response = session.get(
510
+ processed_url,
511
+ headers=headers,
512
+ timeout=15,
513
+ allow_redirects=True,
514
+ verify=False
515
+ )
516
+
517
+ # Check if we got a successful response
518
+ if response.status_code == 200:
519
+ logger.info(f"✓ Successfully fetched URL (status: {response.status_code})")
520
+ # Decode content with UTF-8 and replace errors to avoid encoding warnings
521
+ html_content = response.content.decode('utf-8', errors='replace')
522
+ soup = BeautifulSoup(html_content, 'html.parser', from_encoding='utf-8')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523
  else:
524
+ logger.warning(f"Received HTTP {response.status_code} for {processed_url}")
525
+ raise requests.exceptions.HTTPError(f"HTTP {response.status_code}")
526
+
527
+ except requests.exceptions.Timeout as e:
528
+ last_error = f"Timeout error: Request took longer than 15 seconds"
529
+ logger.warning(f"✗ {last_error}")
530
+ except requests.exceptions.ConnectionError as e:
531
+ last_error = f"Connection error: Unable to establish connection to {processed_url}"
532
+ logger.warning(f"✗ {last_error}")
533
+ except requests.exceptions.HTTPError as e:
534
+ last_error = f"HTTP error: {str(e)}"
535
+ logger.warning(f"✗ {last_error}")
536
+ except requests.exceptions.TooManyRedirects as e:
537
+ last_error = f"Too many redirects: URL redirected too many times"
538
+ logger.warning(f"✗ {last_error}")
539
  except Exception as e:
540
+ last_error = f"Unexpected error in approach 1: {type(e).__name__}: {str(e)[:100]}"
541
+ logger.warning(f"✗ {last_error}")
542
+
543
+ # Approach 2: Fallback to simple request with enhanced headers if first approach failed
544
+ if soup is None:
545
+ try:
546
+ logger.info(f"Trying fallback approach with enhanced headers...")
547
+ time.sleep(2) # Brief delay before retry
548
+
549
+ # More complete headers to mimic a real browser
550
+ headers = {
551
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
552
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
553
+ 'Accept-Language': 'en-US,en;q=0.9',
554
+ 'Accept-Encoding': 'gzip, deflate, br',
555
+ 'DNT': '1',
556
+ 'Connection': 'keep-alive',
557
+ 'Upgrade-Insecure-Requests': '1',
558
+ 'Sec-Fetch-Dest': 'document',
559
+ 'Sec-Fetch-Mode': 'navigate',
560
+ 'Sec-Fetch-Site': 'none',
561
+ 'Sec-Fetch-User': '?1',
562
+ 'Cache-Control': 'max-age=0',
563
+ }
564
+
565
+ response = requests.get(
566
+ processed_url,
567
+ headers=headers,
568
+ timeout=10,
569
+ allow_redirects=True,
570
+ verify=False
571
+ )
572
+
573
+ if response.status_code == 200:
574
+ logger.info(f"✓ Fallback approach succeeded (status: {response.status_code})")
575
+ # Decode content with UTF-8 and replace errors to avoid encoding warnings
576
+ html_content = response.content.decode('utf-8', errors='replace')
577
+ soup = BeautifulSoup(html_content, 'html.parser', from_encoding='utf-8')
578
+ else:
579
+ last_error = f"HTTP {response.status_code}: {response.reason}"
580
+ logger.warning(f"✗ Fallback failed with HTTP {response.status_code}")
581
+
582
+ except Exception as e:
583
+ last_error = f"Fallback error: {type(e).__name__}: {str(e)[:100]}"
584
+ logger.warning(f"✗ {last_error}")
585
+
586
+ # Approach 3: Use Playwright for dynamic content if previous approaches failed
587
+ playwright_resources = None
588
+ if soup is None:
589
+ try:
590
+ logger.info(f"Trying Playwright approach for dynamic content...")
591
+ time.sleep(1) # Brief delay before retry
592
+
593
+ soup, playwright_resources = fetch_page_with_playwright_safe(processed_url)
594
+
595
+ if soup is not None:
596
+ logger.info(f"✓ Playwright approach succeeded")
597
+ else:
598
+ last_error = "Playwright fetch failed"
599
+ logger.warning(f"✗ Playwright approach failed")
600
+
601
+ except Exception as e:
602
+ last_error = f"Playwright error: {type(e).__name__}: {str(e)[:100]}"
603
+ logger.warning(f"✗ {last_error}")
604
+
605
+ # If all approaches failed, return features with -1 sentinel values
606
+ if soup is None:
607
+ error_msg = last_error if last_error else "Unknown error occurred"
608
+ logger.error(f" ✗ Failed to extract features from {processed_url}: {error_msg}")
609
+ print(f" ✗ Failed to extract features: {error_msg}")
610
+ return features
611
+
612
+ # Successfully fetched content, now extract features
613
+ # Use -1 for extraction failures, 0/1 for absent/present elements
614
+ # Each feature extraction includes retry logic for robustness
615
+
616
+ # 1. has_title
617
+ features['has_title'] = extract_feature_with_retry(
618
+ soup, 'has_title',
619
+ lambda s: 1 if s.find('title') else 0
620
+ )
621
+
622
+ # 2. has_input
623
+ features['has_input'] = extract_feature_with_retry(
624
+ soup, 'has_input',
625
+ lambda s: 1 if s.find('input') else 0
626
+ )
627
+
628
+ # 3. has_button
629
+ features['has_button'] = extract_feature_with_retry(
630
+ soup, 'has_button',
631
+ lambda s: 1 if s.find('button') else 0
632
+ )
633
+
634
+ # 4. has_image
635
+ features['has_image'] = extract_feature_with_retry(
636
+ soup, 'has_image',
637
+ lambda s: 1 if s.find('img') else 0
638
+ )
639
+
640
+ # 5. has_submit
641
+ features['has_submit'] = extract_feature_with_retry(
642
+ soup, 'has_submit',
643
+ lambda s: 1 if s.find('input', {'type': 'submit'}) else 0
644
+ )
645
+
646
+ # 6. has_link
647
+ features['has_link'] = extract_feature_with_retry(
648
+ soup, 'has_link',
649
+ lambda s: 1 if s.find('a') else 0
650
+ )
651
+
652
+ # 7. has_password
653
+ features['has_password'] = extract_feature_with_retry(
654
+ soup, 'has_password',
655
+ lambda s: 1 if s.find('input', {'type': 'password'}) else 0
656
+ )
657
+
658
+ # 8. has_email_input
659
+ features['has_email_input'] = extract_feature_with_retry(
660
+ soup, 'has_email_input',
661
+ lambda s: 1 if s.find('input', {'type': 'email'}) else 0
662
+ )
663
+
664
+ # 9. has_hidden_element
665
+ features['has_hidden_element'] = extract_feature_with_retry(
666
+ soup, 'has_hidden_element',
667
+ lambda s: 1 if s.find('input', {'type': 'hidden'}) else 0
668
+ )
669
+
670
+ # 10. has_audio
671
+ features['has_audio'] = extract_feature_with_retry(
672
+ soup, 'has_audio',
673
+ lambda s: 1 if s.find('audio') else 0
674
+ )
675
+
676
+ # 11. has_video
677
+ features['has_video'] = extract_feature_with_retry(
678
+ soup, 'has_video',
679
+ lambda s: 1 if s.find('video') else 0
680
+ )
681
+
682
+ # 12. number_of_inputs
683
+ features['number_of_inputs'] = extract_feature_with_retry(
684
+ soup, 'number_of_inputs',
685
+ lambda s: len(s.find_all('input'))
686
+ )
687
+
688
+ # 13. number_of_buttons
689
+ features['number_of_buttons'] = extract_feature_with_retry(
690
+ soup, 'number_of_buttons',
691
+ lambda s: len(s.find_all('button'))
692
+ )
693
+
694
+ # 14. number_of_images
695
+ features['number_of_images'] = extract_feature_with_retry(
696
+ soup, 'number_of_images',
697
+ lambda s: len(s.find_all('img'))
698
+ )
699
+
700
+ # 15. number_of_option
701
+ features['number_of_option'] = extract_feature_with_retry(
702
+ soup, 'number_of_option',
703
+ lambda s: len(s.find_all('option'))
704
+ )
705
+
706
+ # 16. number_of_list
707
+ features['number_of_list'] = extract_feature_with_retry(
708
+ soup, 'number_of_list',
709
+ lambda s: len(s.find_all('li'))
710
+ )
711
+
712
+ # 17. number_of_th
713
+ features['number_of_th'] = extract_feature_with_retry(
714
+ soup, 'number_of_th',
715
+ lambda s: len(s.find_all('th'))
716
+ )
717
+
718
+ # 18. number_of_tr
719
+ features['number_of_tr'] = extract_feature_with_retry(
720
+ soup, 'number_of_tr',
721
+ lambda s: len(s.find_all('tr'))
722
+ )
723
+
724
+ # 19. number_of_href
725
+ features['number_of_href'] = extract_feature_with_retry(
726
+ soup, 'number_of_href',
727
+ lambda s: len(s.find_all('a', href=True))
728
+ )
729
+
730
+ # 20. number_of_paragraph
731
+ features['number_of_paragraph'] = extract_feature_with_retry(
732
+ soup, 'number_of_paragraph',
733
+ lambda s: len(s.find_all('p'))
734
+ )
735
+
736
+ # 21. number_of_script
737
+ features['number_of_script'] = extract_feature_with_retry(
738
+ soup, 'number_of_script',
739
+ lambda s: len(s.find_all('script'))
740
+ )
741
+
742
+ # 22. length_of_title
743
+ def extract_title_length(s):
744
+ title_tag = s.find('title')
745
+ return len(title_tag.get_text()) if title_tag else 0
746
+
747
+ features['length_of_title'] = extract_feature_with_retry(
748
+ soup, 'length_of_title',
749
+ extract_title_length
750
+ )
751
+
752
+ # 23. has_h1
753
+ features['has_h1'] = extract_feature_with_retry(
754
+ soup, 'has_h1',
755
+ lambda s: 1 if s.find('h1') else 0
756
+ )
757
+
758
+ # 24. has_h2
759
+ features['has_h2'] = extract_feature_with_retry(
760
+ soup, 'has_h2',
761
+ lambda s: 1 if s.find('h2') else 0
762
+ )
763
+
764
+ # 25. has_h3
765
+ features['has_h3'] = extract_feature_with_retry(
766
+ soup, 'has_h3',
767
+ lambda s: 1 if s.find('h3') else 0
768
+ )
769
+
770
+ # 26. length_of_text
771
+ def extract_text_length(s):
772
+ # Create a copy to avoid modifying the original soup
773
+ soup_copy = BeautifulSoup(str(s), 'html.parser')
774
+ for script_or_style in soup_copy(['script', 'style']):
775
+ script_or_style.decompose()
776
+ body = soup_copy.find('body')
777
+ if body:
778
+ text = body.get_text()
779
+ return len(text)
780
+ return 0
781
 
782
+ features['length_of_text'] = extract_feature_with_retry(
783
+ soup, 'length_of_text',
784
+ extract_text_length
785
+ )
786
+
787
+ # 27. number_of_clickable_button
788
+ def extract_clickable_buttons(s):
789
+ buttons = len(s.find_all('button'))
790
+ input_buttons = len(s.find_all('input', {'type': ['button', 'submit', 'reset']}))
791
+ return buttons + input_buttons
792
+
793
+ features['number_of_clickable_button'] = extract_feature_with_retry(
794
+ soup, 'number_of_clickable_button',
795
+ extract_clickable_buttons
796
+ )
797
+
798
+ # 28. number_of_a
799
+ features['number_of_a'] = extract_feature_with_retry(
800
+ soup, 'number_of_a',
801
+ lambda s: len(s.find_all('a'))
802
+ )
803
+
804
+ # 29. number_of_img
805
+ features['number_of_img'] = extract_feature_with_retry(
806
+ soup, 'number_of_img',
807
+ lambda s: len(s.find_all('img'))
808
+ )
809
+
810
+ # 30. number_of_div
811
+ features['number_of_div'] = extract_feature_with_retry(
812
+ soup, 'number_of_div',
813
+ lambda s: len(s.find_all('div'))
814
+ )
815
+
816
+ # 31. number_of_figure
817
+ features['number_of_figure'] = extract_feature_with_retry(
818
+ soup, 'number_of_figure',
819
+ lambda s: len(s.find_all('figure'))
820
+ )
821
+
822
+ # 32. has_footer
823
+ features['has_footer'] = extract_feature_with_retry(
824
+ soup, 'has_footer',
825
+ lambda s: 1 if s.find('footer') else 0
826
+ )
827
+
828
+ # 33. has_form
829
+ features['has_form'] = extract_feature_with_retry(
830
+ soup, 'has_form',
831
+ lambda s: 1 if s.find('form') else 0
832
+ )
833
+
834
+ # 34. has_text_area
835
+ features['has_text_area'] = extract_feature_with_retry(
836
+ soup, 'has_text_area',
837
+ lambda s: 1 if s.find('textarea') else 0
838
+ )
839
+
840
+ # 35. has_iframe
841
+ features['has_iframe'] = extract_feature_with_retry(
842
+ soup, 'has_iframe',
843
+ lambda s: 1 if s.find('iframe') else 0
844
+ )
845
+
846
+ # 36. has_text_input
847
+ features['has_text_input'] = extract_feature_with_retry(
848
+ soup, 'has_text_input',
849
+ lambda s: 1 if s.find('input', {'type': 'text'}) else 0
850
+ )
851
+
852
+ # 37. number_of_meta
853
+ features['number_of_meta'] = extract_feature_with_retry(
854
+ soup, 'number_of_meta',
855
+ lambda s: len(s.find_all('meta'))
856
+ )
857
+
858
+ # 38. has_nav
859
+ features['has_nav'] = extract_feature_with_retry(
860
+ soup, 'has_nav',
861
+ lambda s: 1 if s.find('nav') else 0
862
+ )
863
+
864
+ # 39. has_object
865
+ features['has_object'] = extract_feature_with_retry(
866
+ soup, 'has_object',
867
+ lambda s: 1 if s.find('object') else 0
868
+ )
869
+
870
+ # 40. has_picture
871
+ features['has_picture'] = extract_feature_with_retry(
872
+ soup, 'has_picture',
873
+ lambda s: 1 if s.find('picture') else 0
874
+ )
875
+
876
+ # 41. number_of_sources
877
+ features['number_of_sources'] = extract_feature_with_retry(
878
+ soup, 'number_of_sources',
879
+ lambda s: len(s.find_all('source'))
880
+ )
881
+
882
+ # 42. number_of_span
883
+ features['number_of_span'] = extract_feature_with_retry(
884
+ soup, 'number_of_span',
885
+ lambda s: len(s.find_all('span'))
886
+ )
887
+
888
+ # 43. number_of_table
889
+ features['number_of_table'] = extract_feature_with_retry(
890
+ soup, 'number_of_table',
891
+ lambda s: len(s.find_all('table'))
892
+ )
893
+
894
+ # Clean up Playwright resources if they were created
895
+ if playwright_resources is not None:
896
+ try:
897
+ playwright_instance, browser, context, page = playwright_resources
898
+ if page:
899
+ page.close()
900
+ if context:
901
+ context.close()
902
+ if browser:
903
+ browser.close()
904
+ if playwright_instance:
905
+ playwright_instance.stop()
906
+ logger.debug("Playwright resources closed successfully")
907
+ except Exception as e:
908
+ logger.debug(f"Error closing Playwright resources: {e}")
909
+
910
+ # Count successfully extracted features
911
+ # Features with value >= 0 are successfully extracted, -1 indicates failure
912
+ successful_features = sum(1 for v in features.values() if isinstance(v, int) and v >= 0)
913
+ failed_features = sum(1 for v in features.values() if v == -1)
914
+
915
+ if failed_features > 0:
916
+ logger.warning(f"⚠ Extracted {successful_features}/43 features from {processed_url} ({failed_features} failed)")
917
+ else:
918
+ logger.info(f"✓ Successfully extracted all 43 features from {processed_url}")
919
 
920
+ return features
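For reference, a rough usage sketch of the rewritten extractor (an illustration, not part of the commit; it assumes the package is importable as `model` and that the target page is reachable, since all 43 features are read from the fetched HTML):

from model.url_feature_extractor import extract_features

features = extract_features("example.com")  # preprocess_url adds http:// when the scheme is missing
failed = [name for name, value in features.items() if value == -1]
if failed:
    # -1 is the sentinel for unreachable pages or per-feature extraction failures
    print(f"{len(failed)}/43 features unavailable")
else:
    print(features["has_form"], features["number_of_inputs"])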