rb1337 committed on
Commit
2cc7f91
·
verified ·
1 Parent(s): 539c8ef

Upload 50 files

Browse files
Files changed (50) hide show
  1. README.md +115 -12
  2. requirements.txt +43 -43
  3. scripts/__pycache__/extract_combined_features.cpython-313.pyc +0 -0
  4. scripts/data_collection/crawl_tranco_subpages.py +199 -0
  5. scripts/data_collection/download_html.py +637 -0
  6. scripts/data_collection/download_legitimate_html.py +286 -0
  7. scripts/feature_extraction/__pycache__/extract_combined_features.cpython-313.pyc +0 -0
  8. scripts/feature_extraction/__pycache__/html_features.cpython-313.pyc +0 -0
  9. scripts/feature_extraction/__pycache__/url_features.cpython-313.pyc +0 -0
  10. scripts/feature_extraction/__pycache__/url_features_optimized.cpython-313.pyc +0 -0
  11. scripts/feature_extraction/__pycache__/url_features_v2.cpython-313.pyc +0 -0
  12. scripts/feature_extraction/__pycache__/url_features_v3.cpython-313.pyc +0 -0
  13. scripts/feature_extraction/extract_combined_features.py +347 -0
  14. scripts/feature_extraction/html/__pycache__/feature_engineering.cpython-313.pyc +0 -0
  15. scripts/feature_extraction/html/__pycache__/html_feature_extractor.cpython-313.pyc +0 -0
  16. scripts/feature_extraction/html/extract_features.py +322 -0
  17. scripts/feature_extraction/html/feature_engineering.py +127 -0
  18. scripts/feature_extraction/html/html_feature_extractor.py +510 -0
  19. scripts/feature_extraction/html/v1/__pycache__/html_features.cpython-313.pyc +0 -0
  20. scripts/feature_extraction/html/v1/extract_html_features_simple.py +305 -0
  21. scripts/feature_extraction/html/v1/html_features.py +382 -0
  22. scripts/feature_extraction/url/__pycache__/url_features_v3.cpython-313.pyc +0 -0
  23. scripts/feature_extraction/url/url_features_diagnostic.py +51 -0
  24. scripts/feature_extraction/url/url_features_v1.py +626 -0
  25. scripts/feature_extraction/url/url_features_v2.py +1396 -0
  26. scripts/feature_extraction/url/url_features_v3.py +866 -0
  27. scripts/phishing_analysis/analysis.py +144 -0
  28. scripts/phishing_analysis/phishing_analysis.py +85 -0
  29. scripts/phishing_analysis/phishing_type_analysis.csv +0 -0
  30. scripts/predict_combined.py +274 -0
  31. scripts/predict_html.py +303 -0
  32. scripts/predict_url.py +367 -0
  33. scripts/predict_url_cnn.py +332 -0
  34. scripts/testing/data_leakage_test.py +291 -0
  35. scripts/testing/test_feature_alignment.py +123 -0
  36. scripts/testing/test_normalization.py +105 -0
  37. scripts/testing/test_server.py +255 -0
  38. scripts/utils/analyze_dataset.py +42 -0
  39. scripts/utils/balance_dataset.py +30 -0
  40. scripts/utils/clean_urls.py +49 -0
  41. scripts/utils/merge_datasets.py +19 -0
  42. scripts/utils/remove_duplicates.py +23 -0
  43. server/__pycache__/app.cpython-313.pyc +0 -0
  44. server/app.py +819 -0
  45. server/static/index.html +50 -0
  46. server/static/models.html +1130 -0
  47. server/static/script.js +509 -0
  48. server/static/style.css +1325 -0
  49. start_server.bat +37 -0
  50. start_server.sh +35 -0
README.md CHANGED
@@ -1,12 +1,115 @@
1
- ---
2
- title: Phishing Detection System
3
- emoji: 🚀
4
- colorFrom: pink
5
- colorTo: purple
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- short_description: Phishing-Detection-System
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phishing Detection System
2
+
3
+ Machine learning system for detecting phishing websites using URL features and classical ML algorithms.
4
+
5
+ ## Features
6
+
7
+ - **URL Feature Extraction**: Fast analysis of URL structure, lexical patterns, and security indicators
8
+ - **Multiple ML Models**: Logistic Regression, Random Forest, XGBoost
9
+ - **Interactive Prediction**: Test any URL with trained models
10
+ - **Data Collection**: Scripts for downloading phishing and legitimate URL datasets
11
+
12
+ ## Quick Start
13
+
14
+ ### 1. Clone the Repository
15
+
16
+ ```bash
17
+ git clone <your-repo-url>
18
+ cd src
19
+ ```
20
+
21
+ ### 2. Create Virtual Environment (Windows)
22
+
23
+ ```powershell
24
+ python -m venv venv
25
+ .\venv\Scripts\Activate.ps1
26
+ ```
27
+
28
+ ### 3. Install Dependencies
29
+
30
+ ```bash
31
+ pip install --upgrade pip
32
+ pip install -r requirements.txt
33
+ ```
34
+
35
+ Or install in development mode:
36
+ ```bash
37
+ pip install -e .
38
+ ```
39
+
40
+ ## Usage
41
+ Windows
42
+ ```bash
43
+ ./start_server.bat
44
+ ```
45
+ Linux/Mac
46
+ ```bash
47
+ chmod +x start_server.sh
48
+ ./start_server.sh
49
+ ```
50
+
51
+ Start ngrok to expose local server:
52
+ ```bash
53
+ ngrok http 8000
54
+ ```
55
+
56
+ ### Extract URL Features
57
+
58
+ ```bash
59
+ python scripts/feature_extraction/url_features.py
60
+ ```
61
+
62
+ ### Extract HTML Features
63
+
64
+ ```bash
65
+ python scripts/extract_html_features_simple.py
66
+ ```
67
+
68
+ ### Train URL Models (not required, files are pre-trained)
69
+
70
+ **Logistic Regression (Baseline):**
71
+ ```bash
72
+ python models/baseline/logistic_regression.py
73
+ ```
74
+
75
+ **Random Forest:**
76
+ ```bash
77
+ python models/classical/random_forest.py
78
+ ```
79
+
80
+ **XGBoost:**
81
+ ```bash
82
+ python models/classical/xgboost.py
83
+ ```
84
+
85
+ ### Train HTML Models
86
+
87
+ **XGBoost:**
88
+ ```bash
89
+ python models/html_enhanced/xgboost_html.py
90
+ ```
91
+
92
+ **Random Forest:**
93
+ ```bash
94
+ python models/html_enhanced/random_forest_html_optimalized.py
95
+ ```
96
+
97
+ ### Predict URLs with Trained Models
98
+
99
+ ```bash
100
+ ./start_server.bat
101
+ ```
102
+
103
+ ## Models Performance
104
+
105
+ Results are saved in `results/reports/`
106
+
107
+ ## Dataset Sources
108
+
109
+ - **PhishTank**: Verified phishing URLs database
110
+ - **Majestic Million**: Top 1M websites (legitimate)
111
+ - **Kaggle**: Phishing datasets
112
+
113
+ ## Author
114
+
115
+ Robert Smrek
requirements.txt CHANGED
@@ -1,43 +1,43 @@
1
- # Core Data Science Libraries
2
- numpy>=1.24.0
3
- pandas>=2.0.0
4
- scipy>=1.10.0
5
-
6
- # Machine Learning
7
- scikit-learn>=1.3.0
8
- xgboost>=2.0.0
9
- optuna
10
- tensorflow
11
-
12
- # Web Scraping & URL Analysis
13
- beautifulsoup4>=4.12.0
14
- lxml>=4.9.0
15
- requests>=2.31.0
16
- urllib3>=2.0.0
17
- tldextract>=3.4.0
18
-
19
- # Data Visualization
20
- matplotlib>=3.7.0
21
- seaborn>=0.12.0
22
-
23
- # Progress & Utilities
24
- tqdm>=4.65.0
25
- joblib>=1.3.0
26
- colorama>=0.4.6
27
-
28
- # Jupyter & Notebooks (optional)
29
- jupyter>=1.0.0
30
- ipykernel>=6.23.0
31
- notebook>=6.5.0
32
-
33
- # Testing (optional)
34
- pytest>=7.4.0
35
- pytest-cov>=4.1.0
36
-
37
- # Web Framework
38
- fastapi==0.109.0
39
- uvicorn[standard]==0.27.0
40
- python-multipart==0.0.6
41
-
42
- # CORS
43
- python-dotenv==1.0.0
 
1
+ # Core Data Science Libraries
2
+ numpy>=1.24.0
3
+ pandas>=2.0.0
4
+ scipy>=1.10.0
5
+
6
+ # Machine Learning
7
+ scikit-learn>=1.3.0
8
+ xgboost>=2.0.0
9
+ optuna
10
+ tensorflow
11
+
12
+ # Web Scraping & URL Analysis
13
+ beautifulsoup4>=4.12.0
14
+ lxml>=4.9.0
15
+ requests>=2.31.0
16
+ urllib3>=2.0.0
17
+ tldextract>=3.4.0
18
+
19
+ # Data Visualization
20
+ matplotlib>=3.7.0
21
+ seaborn>=0.12.0
22
+
23
+ # Progress & Utilities
24
+ tqdm>=4.65.0
25
+ joblib>=1.3.0
26
+ colorama>=0.4.6
27
+
28
+ # Jupyter & Notebooks (optional)
29
+ jupyter>=1.0.0
30
+ ipykernel>=6.23.0
31
+ notebook>=6.5.0
32
+
33
+ # Testing (optional)
34
+ pytest>=7.4.0
35
+ pytest-cov>=4.1.0
36
+
37
+ # Web Framework
38
+ fastapi==0.109.0
39
+ uvicorn[standard]==0.27.0
40
+ python-multipart==0.0.6
41
+
42
+ # Environment variables
43
+ python-dotenv==1.0.0
scripts/__pycache__/extract_combined_features.cpython-313.pyc ADDED
Binary file (17 kB). View file
 
scripts/data_collection/crawl_tranco_subpages.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to crawl subpages from Tranco URLs:
4
+ - Reads URLs from tranco_processed.csv
5
+ - Crawls each domain to find up to 10 subpages
6
+ - Creates new dataset with subpage URLs and label 0
7
+ """
8
+
9
+ import pandas as pd
10
+ import requests
11
+ from bs4 import BeautifulSoup
12
+ from urllib.parse import urljoin, urlparse
13
+ import time
14
+ import os
15
+ from tqdm import tqdm
16
+ import logging
17
+ from concurrent.futures import ThreadPoolExecutor, as_completed
18
+ import threading
19
+
20
+ # Setup logging
21
+ logging.basicConfig(
22
+ level=logging.INFO,
23
+ format='%(asctime)s - %(levelname)s - %(message)s'
24
+ )
25
+ logger = logging.getLogger(__name__)
26
+
27
def get_domain(url):
    """Return the origin (scheme + network location) of *url*."""
    parts = urlparse(url)
    return "{}://{}".format(parts.scheme, parts.netloc)
31
+
32
def is_same_domain(url, base_url):
    """Return True when *url* and *base_url* share the same network location."""
    candidate_host = urlparse(url).netloc
    base_host = urlparse(base_url).netloc
    return candidate_host == base_host
35
+
36
def crawl_subpages(base_url, max_subpages=10, timeout=10):
    """
    Collect up to *max_subpages* same-domain links found on *base_url*.

    Args:
        base_url: Page whose anchor tags are harvested
        max_subpages: Upper bound on the number of URLs returned
        timeout: Per-request timeout in seconds

    Returns:
        List of absolute, fragment-stripped subpage URLs; empty list on any
        download or parsing failure (errors are logged, never raised).
    """
    found = set()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Fetch the landing page, following redirects.
        response = requests.get(base_url, headers=headers, timeout=timeout, allow_redirects=True)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        for anchor in soup.find_all('a', href=True):
            if len(found) >= max_subpages:
                break

            # Resolve relative links against the base URL.
            absolute = urljoin(base_url, str(anchor['href']))

            # Keep only links that stay on the same host.
            if not is_same_domain(absolute, base_url):
                continue

            # Strip the fragment but preserve the query string.
            parts = urlparse(absolute)
            cleaned = f"{parts.scheme}://{parts.netloc}{parts.path}"
            if parts.query:
                cleaned += f"?{parts.query}"

            # Skip the page itself and anything already collected.
            if cleaned != base_url and cleaned not in found:
                found.add(cleaned)

        return list(found)[:max_subpages]

    except requests.exceptions.Timeout:
        logger.warning(f"Timeout while crawling {base_url}")
        return []
    except requests.exceptions.RequestException as e:
        logger.warning(f"Error crawling {base_url}: {str(e)}")
        return []
    except Exception as e:
        logger.warning(f"Unexpected error crawling {base_url}: {str(e)}")
        return []
96
+
97
def crawl_dataset(input_file, output_file, max_subpages_per_url=10, max_urls=None, delay=1, num_threads=10):
    """
    Crawl all URLs in a dataset to find same-domain subpages.

    Args:
        input_file: Path to input CSV file (must contain a 'url' column)
        output_file: Path to output CSV file
        max_subpages_per_url: Maximum subpages to collect per URL
        max_urls: Maximum number of URLs to process (None for all)
        delay: Delay between requests in seconds (politeness)
        num_threads: Number of concurrent threads for crawling

    Returns:
        DataFrame with one row per discovered subpage: 'url' and 'label' (0 = legitimate).
    """
    # Read input file
    logger.info(f"Reading {input_file}...")
    df = pd.read_csv(input_file)

    if max_urls:
        df = df.head(max_urls)
        logger.info(f"Processing first {max_urls} URLs")

    logger.info(f"Dataset contains {len(df)} URLs")
    logger.info(f"Using {num_threads} threads for concurrent crawling")

    # Collect all subpages
    all_subpages = []
    lock = threading.Lock()

    def process_url(row):
        """Crawl one base URL and return its result rows; sleeps to be polite."""
        base_url = row['url']
        logger.info(f"Crawling {base_url}...")

        subpages = crawl_subpages(base_url, max_subpages=max_subpages_per_url)

        results = []
        if subpages:
            logger.info(f"Found {len(subpages)} subpages for {base_url}")
            for subpage in subpages:
                results.append({
                    'url': subpage,
                    'label': 0,  # Legitimate
                })
        else:
            logger.warning(f"No subpages found for {base_url}")

        # Delay to be respectful to servers
        time.sleep(delay)
        return results

    # Use ThreadPoolExecutor for concurrent crawling
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Submit all tasks
        future_to_url = {executor.submit(process_url, row): row['url']
                         for _, row in df.iterrows()}

        # Process completed tasks with progress bar
        with tqdm(total=len(df), desc="Crawling URLs") as pbar:
            for future in as_completed(future_to_url):
                try:
                    results = future.result()
                    with lock:
                        all_subpages.extend(results)
                except Exception as e:
                    url = future_to_url[future]
                    logger.error(f"Error processing {url}: {str(e)}")
                finally:
                    pbar.update(1)

    # Pin the schema explicitly so the output CSV always has the expected
    # headers, even when no subpages were collected at all.
    result_df = pd.DataFrame(all_subpages, columns=['url', 'label'])

    logger.info(f"\nTotal subpages collected: {len(result_df)}")
    logger.info(f"Saving to {output_file}...")

    # Save to CSV
    result_df.to_csv(output_file, index=False)

    logger.info("Crawling complete!")
    logger.info(f"\nFirst few rows:\n{result_df.head(10)}")
    logger.info(f"\nDataset statistics:")
    logger.info(f"Total URLs: {len(result_df)}")
    # BUG FIX: the original unconditionally indexed result_df['source_url'],
    # but that key is commented out of the result rows above, so every run
    # ended with a KeyError here. Only report it when the column exists.
    if 'source_url' in result_df.columns:
        logger.info(f"Unique source domains: {result_df['source_url'].nunique()}")

    return result_df
182
+
183
if __name__ == "__main__":
    # Define paths
    # NOTE(review): this script lives in scripts/data_collection/, so
    # project_root resolves to the scripts/ directory — confirm that
    # data/raw/ actually sits under it, otherwise these paths are wrong.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)
    input_file = os.path.join(project_root, 'data', 'raw', 'tranco_processed2.csv')
    output_file = os.path.join(project_root, 'data', 'raw', 'tranco_subpages2.csv')

    # Crawl dataset
    # Process first 100 URLs for testing (remove max_urls=100 to process all)
    crawl_dataset(
        input_file=input_file,
        output_file=output_file,
        max_subpages_per_url=10,
        # max_urls=100,
        delay=1,
        num_threads=10  # Adjust based on your needs (10-20 is usually good)
    )
scripts/data_collection/download_html.py ADDED
@@ -0,0 +1,637 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Download HTML Content from Verified Online Phishing URLs
3
+
4
+ This script downloads HTML content from phishing URLs that are verified and online.
5
+ Saves HTML files for later feature extraction.
6
+ """
7
+
8
+ import pandas as pd
9
+ import requests
10
+ from requests.adapters import HTTPAdapter
11
+ from urllib3.util.retry import Retry
12
+ from pathlib import Path
13
+ from concurrent.futures import ThreadPoolExecutor, as_completed
14
+ from tqdm import tqdm
15
+ import time
16
+ import hashlib
17
+ import logging
18
+ from datetime import datetime
19
+ from bs4 import BeautifulSoup
20
+ import re
21
+ import urllib3
22
+ import random
23
+ from collections import defaultdict
24
+ from threading import Lock
25
+ import json
26
+
27
+ # Disable SSL warnings (expected when downloading phishing sites with invalid certificates)
28
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
29
+
30
+ # Setup logging
31
+ logging.basicConfig(
32
+ level=logging.INFO,
33
+ format='%(asctime)s - %(levelname)s - %(message)s',
34
+ datefmt='%H:%M:%S'
35
+ )
36
+ logger = logging.getLogger("html_downloader")
37
+
38
+
39
class HTMLDownloader:
    """Optimized HTML downloader with retry, checkpointing, and rate limiting."""

    def __init__(self, output_dir='data/html', max_workers=20, timeout=8, checkpoint_interval=100):
        """
        Initialize optimized HTML downloader.

        Creates the output directory tree on disk and loads any existing
        checkpoint file so interrupted runs can resume.

        Args:
            output_dir: Base directory to save HTML files
            max_workers: Number of parallel download threads (increased to 20)
            timeout: Request timeout in seconds (reduced to 8s for faster failure)
            checkpoint_interval: Save progress every N URLs
        """
        self.output_dir = Path(output_dir)
        # Downloads are split by label: 0 -> legitimate/, 1 -> phishing/
        self.legit_dir = self.output_dir / 'legitimate'
        self.phishing_dir = self.output_dir / 'phishing'
        self.legit_dir.mkdir(parents=True, exist_ok=True)
        self.phishing_dir.mkdir(parents=True, exist_ok=True)
        self.max_workers = max_workers
        self.timeout = timeout
        self.checkpoint_interval = checkpoint_interval

        # Stats
        # NOTE(review): these counters are incremented from worker threads
        # without a lock; CPython makes that mostly benign but counts may
        # drift slightly under contention — confirm acceptable.
        self.stats = {
            'total': 0,
            'success': 0,
            'failed': 0,
            'timeout': 0,
            'error': 0,
            'retried': 0,
            'http_fallback': 0
        }

        # User agents rotation (avoid blocks)
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/120.0.0.0',
        ]

        # Domain rate limiting (delay per domain)
        self.domain_last_access = defaultdict(float)
        self.domain_lock = Lock()
        self.min_domain_delay = 0.5  # 500ms between requests to same domain

        # Session pool for connection reuse
        # One session per worker; download_batch assigns them round-robin.
        self.sessions = []
        for _ in range(max_workers):
            session = self._create_session()
            self.sessions.append(session)

        # Checkpoint file
        self.checkpoint_file = self.output_dir / 'download_checkpoint.json'
        self.completed_urls = self._load_checkpoint()
95
+
96
    def _create_session(self):
        """Create optimized requests session with retry and compression.

        Returns:
            A requests.Session with an HTTPAdapter mounted on both schemes,
            automatic retries on transient HTTP errors, and default headers
            advertising compression support.
        """
        session = requests.Session()

        # Retry strategy: 3 retries with exponential backoff
        retry_strategy = Retry(
            total=3,
            backoff_factor=0.5,  # 0.5s, 1s, 2s
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET", "HEAD"]
        )

        # Large connection pool: shared across many target hosts.
        adapter = HTTPAdapter(
            max_retries=retry_strategy,
            pool_connections=100,
            pool_maxsize=100,
            pool_block=False
        )

        session.mount("http://", adapter)
        session.mount("https://", adapter)

        # Enable compression
        session.headers.update({
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Connection': 'keep-alive',
        })

        return session
127
+
128
+ def _get_random_user_agent(self):
129
+ """Get random user agent to avoid detection."""
130
+ return random.choice(self.user_agents)
131
+
132
+ def _load_checkpoint(self):
133
+ """Load checkpoint of already downloaded URLs."""
134
+ if self.checkpoint_file.exists():
135
+ try:
136
+ with open(self.checkpoint_file, 'r') as f:
137
+ data = json.load(f)
138
+ completed = set(data.get('completed_urls', []))
139
+ logger.info(f"Loaded checkpoint: {len(completed):,} URLs already downloaded")
140
+ return completed
141
+ except Exception as e:
142
+ logger.warning(f"Failed to load checkpoint: {e}")
143
+ return set()
144
+
145
+ def _save_checkpoint(self, results):
146
+ """Save checkpoint of completed URLs."""
147
+ try:
148
+ completed = [r['url'] for r in results if r['status'] == 'success']
149
+ self.completed_urls.update(completed)
150
+
151
+ with open(self.checkpoint_file, 'w') as f:
152
+ json.dump({
153
+ 'completed_urls': list(self.completed_urls),
154
+ 'timestamp': datetime.now().isoformat(),
155
+ 'total_completed': len(self.completed_urls)
156
+ }, f)
157
+ except Exception as e:
158
+ logger.warning(f"Failed to save checkpoint: {e}")
159
+
160
+ def _rate_limit_domain(self, url):
161
+ """Apply per-domain rate limiting."""
162
+ try:
163
+ from urllib.parse import urlparse
164
+ domain = urlparse(url).netloc
165
+
166
+ with self.domain_lock:
167
+ last_access = self.domain_last_access[domain]
168
+ now = time.time()
169
+ time_since_last = now - last_access
170
+
171
+ if time_since_last < self.min_domain_delay:
172
+ sleep_time = self.min_domain_delay - time_since_last
173
+ time.sleep(sleep_time)
174
+
175
+ self.domain_last_access[domain] = time.time()
176
+ except:
177
+ pass # If rate limiting fails, continue anyway
178
+
179
+ def _url_to_filename(self, url):
180
+ """Convert URL to safe filename using hash."""
181
+ url_hash = hashlib.md5(url.encode()).hexdigest()
182
+ return f"{url_hash}.html"
183
+
184
    def _optimize_html(self, html_content):
        """
        Aggressively optimize HTML for feature extraction.

        Removes unnecessary content while preserving structure:
        - Comments, excessive whitespace
        - Inline styles (keeps style tags for counting)
        - Large script/style content (keeps tags for counting)
        - Base64 embedded images (huge size, not needed for features)

        Args:
            html_content: Raw HTML content

        Returns:
            Optimized HTML string (typically 60-80% smaller)
        """
        try:
            # Quick regex cleanup before parsing (faster than BeautifulSoup for some tasks)
            # Remove HTML comments
            html_content = re.sub(r'<!--.*?-->', '', html_content, flags=re.DOTALL)

            # Remove base64 embedded images (can be huge, not needed for features)
            html_content = re.sub(r'data:image/[^;]+;base64,[A-Za-z0-9+/=]+', 'data:image', html_content)

            # Parse HTML (use lxml parser if available, it's faster)
            try:
                soup = BeautifulSoup(html_content, 'lxml')
            except:
                soup = BeautifulSoup(html_content, 'html.parser')

            # Remove inline styles (but keep style tags for counting)
            for tag in soup.find_all(style=True):
                del tag['style']

            # Truncate large script/style content (keep tags for counting, trim content)
            for script in soup.find_all('script'):
                if script.string and len(script.string) > 500:
                    script.string = script.string[:500] + '...'

            for style in soup.find_all('style'):
                if style.string and len(style.string) > 500:
                    style.string = style.string[:500] + '...'

            # Normalize whitespace in text nodes
            # NOTE(review): whitespace-only text nodes are left in place
            # (replace_with is skipped when `normalized` is empty) — confirm
            # intended, since they still count toward document size.
            for text in soup.find_all(string=True):
                if text.parent.name not in ['script', 'style']:  # type: ignore
                    normalized = re.sub(r'\s+', ' ', str(text).strip())
                    if normalized:
                        text.replace_with(normalized)

            # Convert back to string
            optimized = str(soup)

            # Final cleanup: remove excessive blank lines
            optimized = re.sub(r'\n\s*\n+', '\n', optimized)

            return optimized

        except Exception as e:
            logger.warning(f"HTML optimization failed: {e}, returning original")
            # Fallback: at least remove comments and excessive whitespace
            html_content = re.sub(r'<!--.*?-->', '', html_content, flags=re.DOTALL)
            html_content = re.sub(r'\n\s*\n+', '\n', html_content)
            return html_content
248
+
249
    def download_single_url(self, url, label, url_id=None, session=None):
        """
        Download HTML with retry logic and HTTP fallback.

        Tries the URL over HTTPS first (adding the scheme if missing), then
        falls back to plain HTTP. On success, the optimized HTML is written
        to the legitimate/ or phishing/ directory based on *label*.

        Args:
            url: URL to download
            label: Label (0=legitimate, 1=phishing)
            url_id: Optional ID from dataset
            session: Requests session (for connection pooling)

        Returns:
            Dictionary with download result; 'status' is one of
            'success', 'skipped', 'failed', 'timeout', or 'error'.
        """
        result = {
            'url': url,
            'label': label,
            'url_id': url_id,
            'status': 'failed',
            'error': None,
            'filename': None,
            'size': 0,
            'original_size': 0
        }

        # Skip if already downloaded
        if url in self.completed_urls:
            result['status'] = 'skipped'
            result['error'] = 'Already downloaded'
            return result

        # Apply rate limiting
        self._rate_limit_domain(url)

        # Use provided session or create temporary one
        if session is None:
            session = self._create_session()

        # Add scheme if missing (default HTTPS)
        # original_url is kept so the saved filename hashes the dataset's
        # exact URL string, independent of scheme rewriting below.
        original_url = url
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        attempts = [url]

        # If HTTPS, also try HTTP as fallback
        if url.startswith('https://'):
            http_url = url.replace('https://', 'http://', 1)
            attempts.append(http_url)

        # Try each URL variant
        for attempt_num, attempt_url in enumerate(attempts):
            try:
                # Random user agent for each attempt
                headers = {'User-Agent': self._get_random_user_agent()}

                # Download with timeout and retries (handled by session)
                response = session.get(
                    attempt_url,
                    headers=headers,
                    timeout=(3, self.timeout),  # (connect timeout, read timeout)
                    allow_redirects=True,
                    verify=False,  # Phishing sites often have invalid SSL
                    stream=False  # We need full content
                )

                # Check if successful
                if response.status_code == 200:
                    # Check content type (skip if not HTML)
                    # NOTE(review): when the *last* attempt fails these two
                    # content checks, no stats counter is incremented even
                    # though status stays 'failed' — confirm intended.
                    content_type = response.headers.get('Content-Type', '')
                    if 'text/html' not in content_type.lower() and 'application/xhtml' not in content_type.lower():
                        result['status'] = 'failed'
                        result['error'] = f'Non-HTML content: {content_type}'
                        continue

                    # Get HTML content
                    html_content = response.text
                    result['original_size'] = len(html_content)

                    # Skip if too small (likely error page)
                    if len(html_content) < 200:
                        result['status'] = 'failed'
                        result['error'] = 'HTML too small (< 200 bytes)'
                        continue

                    # Optimize HTML for feature extraction
                    optimized_html = self._optimize_html(html_content)

                    # Save to appropriate directory
                    filename = self._url_to_filename(original_url)
                    target_dir = self.legit_dir if label == 0 else self.phishing_dir
                    filepath = target_dir / filename

                    with open(filepath, 'w', encoding='utf-8', errors='ignore') as f:
                        f.write(optimized_html)

                    result['status'] = 'success'
                    result['filename'] = filename
                    result['size'] = len(optimized_html)
                    result['target_dir'] = str(target_dir.name)
                    result['compression_ratio'] = f"{(1 - len(optimized_html) / max(result['original_size'], 1)) * 100:.1f}%"

                    if attempt_num > 0:
                        result['http_fallback'] = True
                        self.stats['http_fallback'] += 1

                    self.stats['success'] += 1
                    return result  # Success!

                else:
                    result['error'] = f"HTTP {response.status_code}"
                    if attempt_num == len(attempts) - 1:  # Last attempt
                        result['status'] = 'failed'
                        self.stats['failed'] += 1

            except requests.Timeout:
                result['error'] = 'Timeout'
                if attempt_num == len(attempts) - 1:
                    result['status'] = 'timeout'
                    self.stats['timeout'] += 1

            except requests.RequestException as e:
                result['error'] = f"{type(e).__name__}: {str(e)[:80]}"
                if attempt_num == len(attempts) - 1:
                    result['status'] = 'error'
                    self.stats['error'] += 1

            except Exception as e:
                result['error'] = f"Unknown: {str(e)[:80]}"
                if attempt_num == len(attempts) - 1:
                    result['status'] = 'error'
                    self.stats['error'] += 1

        return result
382
+
383
    def download_batch(self, urls_df, label_column='label', id_column=None, resume=True):
        """
        Download HTML with checkpointing and session pooling.

        URLs already recorded in the checkpoint are filtered out up front
        when *resume* is True; progress is checkpointed every
        self.checkpoint_interval completed downloads.

        Args:
            urls_df: DataFrame with URLs (column 'url' or 'URL')
            label_column: Column name for labels (rows default to 1/phishing
                when the column is absent)
            id_column: Optional column name for IDs (falls back to the row index)
            resume: Resume from checkpoint if available

        Returns:
            DataFrame with one result dict per attempted URL.
        """
        self.stats['total'] = len(urls_df)

        # Filter already downloaded URLs if resuming
        if resume and self.completed_urls:
            url_column = 'url' if 'url' in urls_df.columns else 'URL'
            urls_df = urls_df[~urls_df[url_column].isin(self.completed_urls)].copy()
            skipped = self.stats['total'] - len(urls_df)
            if skipped > 0:
                logger.info(f"Resuming: {skipped:,} URLs already downloaded, {len(urls_df):,} remaining")

        logger.info(f"Starting optimized download of {len(urls_df):,} URLs...")
        logger.info(f"Workers: {self.max_workers} | Timeout: {self.timeout}s | Checkpoint: every {self.checkpoint_interval} URLs")
        logger.info(f"Output: {self.output_dir.absolute()}")
        logger.info(f"Features: Session pooling, retry logic, HTTP fallback, rate limiting, compression")

        results = []
        session_idx = 0

        # Prepare tasks
        tasks = []
        for idx, row in urls_df.iterrows():
            url = row['url'] if 'url' in row else row['URL']
            label = row[label_column] if label_column in row else 1
            url_id = row[id_column] if id_column and id_column in row else idx
            tasks.append((url, label, url_id))

        # Download in parallel with progress bar and checkpointing
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit tasks with session pooling
            future_to_task = {}
            for url, label, url_id in tasks:
                # Round-robin session assignment
                session = self.sessions[session_idx % len(self.sessions)]
                session_idx += 1

                future = executor.submit(self.download_single_url, url, label, url_id, session)
                future_to_task[future] = (url, label, url_id)

            # Process completed tasks with progress bar
            # Note: only this (main) thread appends to `results`, so no lock
            # is needed around it.
            with tqdm(total=len(tasks), desc="Downloading", unit="url") as pbar:
                checkpoint_counter = 0

                for future in as_completed(future_to_task):
                    result = future.result()
                    results.append(result)
                    pbar.update(1)

                    checkpoint_counter += 1

                    # Save checkpoint periodically
                    if checkpoint_counter >= self.checkpoint_interval:
                        self._save_checkpoint(results)
                        checkpoint_counter = 0

                    # Update progress bar with detailed stats
                    pbar.set_postfix({
                        'OK': self.stats['success'],
                        'Fail': self.stats['failed'],
                        'Timeout': self.stats['timeout'],
                        'HTTP↓': self.stats['http_fallback']
                    })

        # Final checkpoint save
        self._save_checkpoint(results)

        # Create results DataFrame
        results_df = pd.DataFrame(results)

        # Print summary
        self._print_summary(results_df)

        return results_df
468
+
469
+ def _print_summary(self, results_df):
470
+ """Print detailed download summary with optimization metrics."""
471
+ logger.info("\n" + "="*80)
472
+ logger.info("DOWNLOAD SUMMARY")
473
+ logger.info("="*80)
474
+
475
+ total = self.stats['total']
476
+ success = self.stats['success']
477
+
478
+ logger.info(f"\nTotal URLs processed: {total:,}")
479
+ logger.info(f" ✓ Successful: {success:,} ({success/max(total,1)*100:.1f}%)")
480
+ logger.info(f" ✗ Failed: {self.stats['failed']:,}")
481
+ logger.info(f" ⏱ Timeout: {self.stats['timeout']:,}")
482
+ logger.info(f" ⚠ Error: {self.stats['error']:,}")
483
+ logger.info(f" ↓ HTTP Fallback: {self.stats['http_fallback']:,}")
484
+
485
+ # Detailed stats if we have results
486
+ if not results_df.empty and 'status' in results_df.columns:
487
+ # Success by label
488
+ if 'label' in results_df.columns:
489
+ success_by_label = results_df[results_df['status'] == 'success'].groupby('label').size()
490
+ if not success_by_label.empty:
491
+ logger.info(f"\nSuccessful downloads by type:")
492
+ for label, count in success_by_label.items():
493
+ label_name = 'Phishing' if label == 1 else 'Legitimate'
494
+ logger.info(f" {label_name}: {count:,}")
495
+
496
+ # Size statistics
497
+ successful = results_df[results_df['status'] == 'success']
498
+ if not successful.empty and 'size' in successful.columns:
499
+ total_optimized = successful['size'].sum()
500
+ total_original = successful.get('original_size', successful['size']).sum()
501
+
502
+ logger.info(f"\nStorage statistics:")
503
+ logger.info(f" Original size: {total_original/1024/1024:.2f} MB")
504
+ logger.info(f" Optimized size: {total_optimized/1024/1024:.2f} MB")
505
+ if total_original > 0:
506
+ saved = (1 - total_optimized / total_original) * 100
507
+ logger.info(f" Space saved: {saved:.1f}%")
508
+
509
+ # Error breakdown
510
+ failed = results_df[results_df['status'] != 'success']
511
+ if not failed.empty and 'error' in failed.columns:
512
+ error_counts = failed['error'].value_counts().head(5)
513
+ if not error_counts.empty:
514
+ logger.info(f"\nTop failure reasons:")
515
+ for error, count in error_counts.items():
516
+ logger.info(f" {error}: {count:,}")
517
+
518
+ logger.info("="*80)
519
+
520
+
521
def main():
    """CLI entry point: download HTML content for labeled URLs.

    Loads a CSV of URLs (requires a url/URL column and a label column),
    optionally balances the classes and/or limits the row count, runs the
    optimized batch downloader with checkpointing, then writes a per-URL
    results CSV plus a url→filename metadata CSV to the output directory.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Download HTML content from URLs and organize by label')
    parser.add_argument('--input', type=str, default='data/processed/clean_dataset.csv',
                        help='Input CSV file with URLs (must have url,label,type columns)')
    parser.add_argument('--output', type=str, default='data/html',
                        help='Base output directory (will create legitimate/ and phishing/ subdirectories)')
    parser.add_argument('--workers', type=int, default=20,
                        help='Number of parallel download workers (default: 20)')
    parser.add_argument('--timeout', type=int, default=8,
                        help='Request timeout in seconds (default: 8s)')
    parser.add_argument('--checkpoint', type=int, default=100,
                        help='Save progress every N URLs (default: 100)')
    parser.add_argument('--resume', action='store_true', default=True,
                        help='Resume from checkpoint (default: True)')
    parser.add_argument('--no-resume', dest='resume', action='store_false',
                        help='Start fresh, ignore checkpoint')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit number of URLs to download (for testing)')
    parser.add_argument('--balance', action='store_true',
                        help='Download equal number of legitimate and phishing URLs')

    args = parser.parse_args()

    logger.info("="*80)
    logger.info("HTML CONTENT DOWNLOADER - Phishing Detection")
    logger.info("="*80)

    # Resolve input relative to the project root (this script lives three
    # directory levels below it: scripts/data_collection/<file>.py)
    script_dir = Path(__file__).parent.parent.parent
    input_path = (script_dir / args.input).resolve()

    logger.info(f"\nLoading URLs from: {input_path}")
    df = pd.read_csv(input_path)
    logger.info(f"Loaded: {len(df):,} URLs")

    # Show columns
    logger.info(f"Columns: {list(df.columns)}")

    # Verify required columns before doing any work
    if 'url' not in df.columns and 'URL' not in df.columns:
        logger.error("No 'url' or 'URL' column found in dataset!")
        return

    if 'label' not in df.columns:
        logger.error("No 'label' column found in dataset!")
        return

    # Show label distribution
    logger.info(f"\nLabel distribution in dataset:")
    label_counts = df['label'].value_counts()
    for label, count in label_counts.items():
        label_name = 'Legitimate' if label == 0 else 'Phishing'
        logger.info(f" {label_name} (label={label}): {count:,}")

    # Balance dataset if requested: undersample both classes to the minority size
    if args.balance:
        min_count = label_counts.min()
        df_balanced = pd.concat([
            df[df['label'] == 0].sample(n=min(min_count, len(df[df['label'] == 0])), random_state=42),
            df[df['label'] == 1].sample(n=min(min_count, len(df[df['label'] == 1])), random_state=42)
        ]).sample(frac=1, random_state=42).reset_index(drop=True)
        df = df_balanced
        logger.info(f"\nBalanced dataset to {min_count:,} samples per class")
        logger.info(f"Total URLs after balancing: {len(df):,}")

    # Limit for testing
    if args.limit:
        df = df.head(args.limit)
        logger.info(f"Limited to first {args.limit:,} URLs for testing")

    # Initialize optimized downloader
    output_dir = (script_dir / args.output).resolve()
    downloader = HTMLDownloader(
        output_dir=output_dir,
        max_workers=args.workers,
        timeout=args.timeout,
        checkpoint_interval=args.checkpoint
    )

    # Download HTML content with checkpointing
    results_df = downloader.download_batch(
        df,
        label_column='label' if 'label' in df.columns else None,  # type: ignore
        id_column='phish_id' if 'phish_id' in df.columns else None,  # type: ignore
        resume=args.resume
    )

    # Save results
    results_file = output_dir / f'download_results_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
    results_df.to_csv(results_file, index=False)
    logger.info(f"\n✓ Results saved to: {results_file}")

    # Save metadata mapping (URL to filename).
    # BUGFIX: when no download succeeded, results_df can lack the 'status' /
    # 'filename' columns entirely and the old unconditional indexing raised
    # KeyError. Guard the metadata write the same way the sibling
    # download_legitimate_html.py script does.
    if not results_df.empty and 'status' in results_df.columns:
        successful = results_df[results_df['status'] == 'success']
        if not successful.empty:
            metadata = successful[['url', 'label', 'filename', 'url_id']]
            metadata_file = output_dir / 'html_metadata.csv'
            metadata.to_csv(metadata_file, index=False)
            logger.info(f"✓ Metadata saved to: {metadata_file}")
        else:
            logger.warning("No successful downloads - metadata file not written")
    else:
        logger.warning("No successful downloads - metadata file not written")

    logger.info("\n" + "="*80)
    logger.info("✓ HTML DOWNLOAD COMPLETE!")
    logger.info("="*80)
    logger.info(f"\nFiles saved to:")
    logger.info(f" Legitimate: {output_dir / 'legitimate'}")
    logger.info(f" Phishing: {output_dir / 'phishing'}")
    logger.info(f"\nHTML files have been optimized for feature extraction:")
    logger.info(f" - Comments removed")
    logger.info(f" - Whitespace normalized")
    logger.info(f" - Inline styles removed")
    logger.info(f" - Structure preserved for feature extraction")
    logger.info("="*80)


if __name__ == "__main__":
    main()
scripts/data_collection/download_legitimate_html.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Download HTML Content from Legitimate URLs
3
+ Downloads HTML from top-1m legitimate URLs for training
4
+ """
5
+ import pandas as pd
6
+ from pathlib import Path
7
+ import requests
8
+ import hashlib
9
+ import logging
10
+ from concurrent.futures import ThreadPoolExecutor, as_completed
11
+ from tqdm import tqdm
12
+ from datetime import datetime
13
+ import warnings
14
+
15
# Disable SSL warnings: pages are fetched with verify=False further below,
# so urllib3 would otherwise spam one warning per request
warnings.filterwarnings('ignore', message='Unverified HTTPS request')

# Setup logging: timestamped INFO-level output for progress/summary messages
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%H:%M:%S'
)
logger = logging.getLogger(__name__)
25
+
26
+
27
class LegitimateHTMLDownloader:
    """Download HTML content from legitimate URLs.

    Each page is saved as <md5(url)>.html under *output_dir*. Running
    counters in ``self.stats`` are updated from worker threads, so all
    increments go through a lock (a plain ``dict[key] += 1`` is a
    read-modify-write and is not atomic across threads).
    """

    def __init__(self, output_dir='data/html_legitimate', max_workers=10, timeout=10):
        """
        Args:
            output_dir: Directory that will receive the .html files (created
                if missing).
            max_workers: Number of parallel download threads.
            timeout: Per-request timeout in seconds.
        """
        import threading  # local import: only needed for the stats lock

        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.max_workers = max_workers
        self.timeout = timeout

        # Running counters, shared across worker threads
        self.stats = {
            'total': 0,
            'success': 0,
            'failed': 0,
            'timeout': 0,
            'error': 0
        }
        # BUGFIX: download_single_url runs concurrently in a thread pool;
        # unsynchronized `+=` on the stats dict could lose increments.
        self._stats_lock = threading.Lock()

        # Headers to mimic browser
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def _bump(self, key):
        """Thread-safely increment one stats counter."""
        with self._stats_lock:
            self.stats[key] += 1

    def _url_to_filename(self, url):
        """Convert URL to safe filename using MD5 hash."""
        url_hash = hashlib.md5(url.encode()).hexdigest()
        return f"{url_hash}.html"

    def download_single_url(self, url, url_id=None):
        """
        Download HTML from a single URL.

        Args:
            url: URL to download (scheme is prepended if missing)
            url_id: Optional ID from dataset

        Returns:
            Dictionary with download result (status/filename/size/error)
        """
        result = {
            'url': url,
            'url_id': url_id,
            'status': 'failed',
            'filename': None,
            'size': 0,
            'error': None
        }

        self._bump('total')

        try:
            # Add https:// if missing (input may be bare domains)
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url

            # Download with timeout; verify=False tolerates bad certificates
            response = requests.get(
                url,
                headers=self.headers,
                timeout=self.timeout,
                allow_redirects=True,
                verify=False
            )

            # Check if successful
            if response.status_code == 200:
                # Save HTML content under a hash-derived filename
                filename = self._url_to_filename(url)
                filepath = self.output_dir / filename

                with open(filepath, 'w', encoding='utf-8', errors='ignore') as f:
                    f.write(response.text)

                result['status'] = 'success'
                result['filename'] = filename
                result['size'] = len(response.text)
                self._bump('success')

            else:
                result['status'] = 'failed'
                result['error'] = f"HTTP {response.status_code}"
                self._bump('failed')

        except requests.Timeout:
            result['status'] = 'timeout'
            result['error'] = 'Timeout'
            self._bump('timeout')

        except requests.RequestException as e:
            result['status'] = 'error'
            result['error'] = f"Request error: {str(e)[:100]}"
            self._bump('error')

        except Exception as e:
            result['status'] = 'error'
            result['error'] = f"Unknown error: {str(e)[:100]}"
            self._bump('error')

        return result

    def download_batch(self, urls_df, id_column=None):
        """
        Download HTML content from multiple URLs in parallel.

        Args:
            urls_df: DataFrame with URLs (accepts 'URL', 'url' or 'domain'
                columns; otherwise the second column is assumed to be the URL)
            id_column: Optional column name for ID

        Returns:
            DataFrame with one result row per URL
        """
        logger.info(f"Starting download of {len(urls_df):,} URLs...")
        logger.info(f"Using {self.max_workers} parallel workers")
        logger.info(f"Timeout: {self.timeout}s per URL")
        logger.info(f"Output directory: {self.output_dir.absolute()}")

        results = []

        # Prepare tasks, tolerating several column layouts
        tasks = []
        for idx, row in urls_df.iterrows():
            # Handle different column names
            if 'URL' in row:
                url = row['URL']
            elif 'url' in row:
                url = row['url']
            elif 'domain' in row:
                url = row['domain']
            else:
                # Assume second column is URL/domain (rank,domain layout)
                url = row.iloc[1]

            url_id = row[id_column] if id_column and id_column in row else idx
            tasks.append((url, url_id))

        # Download in parallel with progress bar
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all tasks
            future_to_task = {
                executor.submit(self.download_single_url, url, url_id): (url, url_id)
                for url, url_id in tasks
            }

            # Process completed tasks with progress bar
            with tqdm(total=len(tasks), desc="Downloading HTML", unit="url") as pbar:
                for future in as_completed(future_to_task):
                    result = future.result()
                    results.append(result)
                    pbar.update(1)

                    # Update progress bar description with stats
                    pbar.set_postfix({
                        'Success': self.stats['success'],
                        'Failed': self.stats['failed'] + self.stats['timeout'] + self.stats['error']
                    })

        # Create results DataFrame
        results_df = pd.DataFrame(results)

        # Print summary
        self._print_summary(results_df)

        return results_df

    def _print_summary(self, results_df):
        """Print download summary statistics."""
        logger.info("\n" + "="*80)
        logger.info("DOWNLOAD SUMMARY")
        logger.info("="*80)

        logger.info(f"\nTotal URLs processed: {self.stats['total']:,}")
        logger.info(f" ✓ Successful: {self.stats['success']:,} ({self.stats['success']/max(self.stats['total'],1)*100:.1f}%)")
        logger.info(f" ✗ Failed: {self.stats['failed']:,}")
        logger.info(f" ⏱ Timeout: {self.stats['timeout']:,}")
        logger.info(f" ⚠ Error: {self.stats['error']:,}")

        # Only show detailed stats if we have results
        if not results_df.empty and 'status' in results_df.columns:
            # Total size downloaded
            successful_downloads = results_df[results_df['status'] == 'success']
            if not successful_downloads.empty:
                total_size = successful_downloads['size'].sum()
                logger.info(f"\nTotal HTML downloaded: {total_size/1024/1024:.2f} MB")

        logger.info("="*80)
212
+
213
+
214
def main():
    """CLI entry point: download HTML from legitimate (Tranco-style) URLs.

    Loads a CSV of domains/URLs, optionally limits the count, downloads each
    page in parallel, and writes a results CSV plus a url→filename metadata
    CSV to the output directory.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Download HTML content from legitimate URLs')
    parser.add_argument('--input', type=str, default='data/raw/legitimate.csv',
                        help='Input CSV file with legitimate URLs (default: top-1m.csv with 1M URLs)')
    parser.add_argument('--output', type=str, default='data/html_legitimate',
                        help='Output directory for HTML files')
    parser.add_argument('--limit', type=int, default=50000,
                        help='Number of URLs to download (default: 50000)')
    parser.add_argument('--workers', type=int, default=20,
                        help='Number of parallel workers (default: 20)')
    parser.add_argument('--timeout', type=int, default=10,
                        help='Timeout per URL in seconds (default: 10)')

    args = parser.parse_args()

    # Print header
    logger.info("="*80)
    logger.info("LEGITIMATE HTML DOWNLOADER - Phishing Detection")
    logger.info("="*80)

    # BUGFIX: this script lives in scripts/data_collection/, so the project
    # root is THREE levels up. The previous two-level hop resolved to
    # scripts/, breaking the default 'data/...' paths (the sibling
    # download_html.py already uses three parents).
    script_dir = Path(__file__).parent.parent.parent
    input_path = (script_dir / args.input).resolve()

    logger.info(f"\nLoading URLs from: {input_path}")
    df = pd.read_csv(input_path)
    logger.info(f"Loaded: {len(df):,} URLs")

    # Show columns
    logger.info(f"Columns: {list(df.columns)}")

    # Limit number of URLs
    if args.limit:
        df = df.head(args.limit)
        logger.info(f"Limited to first {args.limit:,} URLs")

    # Initialize downloader
    output_dir = (script_dir / args.output).resolve()
    downloader = LegitimateHTMLDownloader(
        output_dir=output_dir,
        max_workers=args.workers,
        timeout=args.timeout
    )

    # Download
    results_df = downloader.download_batch(
        df,
        id_column='id' if 'id' in df.columns else None
    )

    # Save results
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    results_path = output_dir / f'download_results_{timestamp}.csv'
    results_df.to_csv(results_path, index=False)
    logger.info(f"\n✓ Results saved to: {results_path}")

    # Save successful downloads metadata (guard against an empty results
    # frame, where the 'status' column would not exist at all)
    if not results_df.empty and 'status' in results_df.columns:
        successful = results_df[results_df['status'] == 'success']
        if len(successful) > 0:
            metadata_path = output_dir / 'html_metadata.csv'
            successful[['url', 'filename', 'size']].to_csv(metadata_path, index=False)
            logger.info(f"✓ Metadata saved to: {metadata_path}")

    logger.info("\n" + "="*80)
    logger.info("✓ LEGITIMATE HTML DOWNLOAD COMPLETE!")
    logger.info("="*80)


if __name__ == '__main__':
    main()
scripts/feature_extraction/__pycache__/extract_combined_features.cpython-313.pyc ADDED
Binary file (17 kB). View file
 
scripts/feature_extraction/__pycache__/html_features.cpython-313.pyc ADDED
Binary file (21.8 kB). View file
 
scripts/feature_extraction/__pycache__/url_features.cpython-313.pyc ADDED
Binary file (61.2 kB). View file
 
scripts/feature_extraction/__pycache__/url_features_optimized.cpython-313.pyc ADDED
Binary file (51 kB). View file
 
scripts/feature_extraction/__pycache__/url_features_v2.cpython-313.pyc ADDED
Binary file (61.3 kB). View file
 
scripts/feature_extraction/__pycache__/url_features_v3.cpython-313.pyc ADDED
Binary file (50.8 kB). View file
 
scripts/feature_extraction/extract_combined_features.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Combined URL + HTML Feature Extraction from clean_dataset.csv
3
+
4
+ Reads URLs from clean_dataset.csv, extracts URL features and downloads HTML
5
+ to extract HTML features, combines them into a single feature dataset.
6
+ Produces a balanced combined_features.csv.
7
+
8
+ Usage:
9
+ python scripts/feature_extraction/extract_combined_features.py
10
+ python scripts/feature_extraction/extract_combined_features.py --workers 20 --timeout 15
11
+ python scripts/feature_extraction/extract_combined_features.py --limit 1000 --no-balance
12
+ """
13
+ import argparse
14
+ import logging
15
+ import random
16
+ import sys
17
+ import time
18
+ import warnings
19
+ from concurrent.futures import ThreadPoolExecutor, as_completed
20
+ from pathlib import Path
21
+ from threading import Lock
22
+
23
+ import numpy as np
24
+ import pandas as pd
25
+ import requests
26
+ import urllib3
27
+ from tqdm import tqdm
28
+
29
+ # Suppress SSL warnings (phishing sites often have invalid certs)
30
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
31
+ warnings.filterwarnings('ignore', message='.*Unverified HTTPS.*')
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # Project setup
35
+ # ---------------------------------------------------------------------------
36
+ PROJECT_ROOT = Path(__file__).resolve().parents[2] # src/
37
+ sys.path.insert(0, str(PROJECT_ROOT))
38
+
39
+ from scripts.feature_extraction.url.url_features_v3 import URLFeatureExtractorOptimized
40
+ from scripts.feature_extraction.html.html_feature_extractor import HTMLFeatureExtractor
41
+ from scripts.feature_extraction.html.feature_engineering import engineer_features
42
+
43
+ # ---------------------------------------------------------------------------
44
+ # Logging
45
+ # ---------------------------------------------------------------------------
46
+ logging.basicConfig(
47
+ level=logging.INFO,
48
+ format='%(asctime)s - %(levelname)s - %(message)s',
49
+ datefmt='%H:%M:%S',
50
+ )
51
+ logger = logging.getLogger('extract_combined')
52
+
53
+ # ---------------------------------------------------------------------------
54
+ # Constants
55
+ # ---------------------------------------------------------------------------
56
+ HEADERS = {
57
+ 'User-Agent': (
58
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
59
+ 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
60
+ ),
61
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
62
+ 'Accept-Language': 'en-US,en;q=0.5',
63
+ }
64
+
65
+ CHECKPOINT_FILE = PROJECT_ROOT / 'data' / 'features' / '_combined_checkpoint.csv'
66
+
67
+
68
+ # ---------------------------------------------------------------------------
69
+ # Feature extraction for a single URL (runs in thread)
70
+ # ---------------------------------------------------------------------------
71
def extract_single(
    url: str,
    label: int,
    url_extractor: URLFeatureExtractorOptimized,
    html_extractor: HTMLFeatureExtractor,
    timeout: int = 10,
) -> dict | None:
    """
    Build the combined URL + HTML feature row for one URL.

    URL features are computed locally; HTML features require downloading the
    page. When the download or parse fails, every HTML feature column is
    zero-filled so the row keeps a uniform schema.

    Returns:
        Dict with 'url', 'label' and prefixed feature columns,
        or None when even URL feature extraction fails.
    """
    row = {'url': url, 'label': label}

    # --- 1. URL-only features: no network needed; failure aborts the row ---
    try:
        for name, value in url_extractor.extract_features(url).items():
            row[f'url_{name}'] = value
    except Exception as exc:
        logger.debug(f"URL feature error for {url}: {exc}")
        return None

    # --- 2. Fetch the page and derive engineered HTML features ---
    got_html = False
    try:
        resp = requests.get(
            url, timeout=timeout, verify=False, headers=HEADERS,
            allow_redirects=True,
        )
        # Require a real page: 200 OK and a non-trivial body
        if resp.status_code == 200 and len(resp.text) > 200:
            engineered = engineer_features(
                pd.DataFrame([html_extractor.extract_features(resp.text)])
            )
            for name, value in engineered.iloc[0].to_dict().items():
                row[f'html_{name}'] = value
            got_html = True
    except Exception:
        pass  # network/parse failures fall through to zero-fill below

    if not got_html:
        # Run an empty-document extraction just to learn the engineered
        # column names, then zero-fill every HTML feature.
        empty_cols = engineer_features(
            pd.DataFrame([html_extractor.extract_features('')])
        ).columns
        for name in empty_cols:
            row[f'html_{name}'] = 0

    return row
124
+
125
+
126
+ # ---------------------------------------------------------------------------
127
+ # Batch extraction with threading + checkpointing
128
+ # ---------------------------------------------------------------------------
129
def extract_all(
    df: pd.DataFrame,
    max_workers: int = 10,
    timeout: int = 10,
    checkpoint_every: int = 500,
) -> pd.DataFrame:
    """
    Extract combined features for all URLs using thread pool.

    Supports resuming: previously processed URLs are loaded from the
    checkpoint CSV and skipped.

    Args:
        df: DataFrame with 'url' and 'label' columns.
        max_workers: Parallel download threads.
        timeout: HTTP timeout per URL (seconds).
        checkpoint_every: Save intermediate results every N rows.

    Returns:
        DataFrame with combined features (one row per successfully
        processed URL; total failures are dropped entirely).
    """
    url_extractor = URLFeatureExtractorOptimized()
    html_extractor = HTMLFeatureExtractor()

    urls = df['url'].tolist()
    labels = df['label'].tolist()
    total = len(urls)

    # --- Load checkpoint if exists (carries over prior results + done set) ---
    done_urls = set()
    results = []
    if CHECKPOINT_FILE.exists():
        ckpt = pd.read_csv(CHECKPOINT_FILE)
        done_urls = set(ckpt['url'].tolist())
        results = ckpt.to_dict('records')
        logger.info(f"Resuming from checkpoint: {len(done_urls):,} URLs already done")

    # Skip URLs already present in the checkpoint
    remaining = [(u, l) for u, l in zip(urls, labels) if u not in done_urls]
    logger.info(f"Remaining URLs to process: {len(remaining):,} / {total:,}")

    if not remaining:
        logger.info("All URLs already processed!")
        return pd.DataFrame(results)

    # Counters for the run summary; the lock serializes result handling
    # (results list append + counter updates + checkpoint write)
    lock = Lock()
    n_success = 0
    n_html_fail = 0
    n_fail = 0
    t_start = time.perf_counter()

    def _worker(url_label):
        # Thread worker: unpack one (url, label) task and extract its row
        u, l = url_label
        return extract_single(u, l, url_extractor, html_extractor, timeout)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(_worker, item): item for item in remaining}

        with tqdm(total=len(remaining), desc='Extracting', unit='url') as pbar:
            for future in as_completed(futures):
                pbar.update(1)
                result = future.result()

                with lock:
                    if result is not None:
                        results.append(result)
                        n_success += 1

                        # Check if HTML was zero-filled
                        # NOTE(review): this relies on extract_single writing
                        # html_num_tags == 0 when the download failed —
                        # confirm that column name stays in sync.
                        if result.get('html_num_tags', 0) == 0:
                            n_html_fail += 1
                    else:
                        n_fail += 1

                    # Checkpoint (based on accumulated row count, so with
                    # skipped None results it may not fire on every Nth
                    # success — acceptable for a best-effort checkpoint)
                    if len(results) % checkpoint_every == 0:
                        _save_checkpoint(results)

    elapsed = time.perf_counter() - t_start
    speed = len(remaining) / elapsed if elapsed > 0 else 0

    logger.info(f"\nExtraction complete in {elapsed:.1f}s ({speed:.0f} URLs/sec)")
    logger.info(f"  Successful: {n_success:,}")
    logger.info(f"  HTML download failed (zero-filled): {n_html_fail:,}")
    logger.info(f"  Total failures (skipped): {n_fail:,}")

    # Final checkpoint
    _save_checkpoint(results)

    return pd.DataFrame(results)
215
+
216
+
217
def _save_checkpoint(results: list):
    """Persist intermediate results so an interrupted run can resume."""
    CHECKPOINT_FILE.parent.mkdir(parents=True, exist_ok=True)
    snapshot = pd.DataFrame(results)
    snapshot.to_csv(CHECKPOINT_FILE, index=False)
221
+
222
+
223
+ # ---------------------------------------------------------------------------
224
+ # Balance dataset
225
+ # ---------------------------------------------------------------------------
226
def balance_dataset(df: pd.DataFrame, random_state: int = 42) -> pd.DataFrame:
    """Undersample the majority class so both labels have equal counts.

    Args:
        df: Feature frame with a 'label' column.
        random_state: Seed for reproducible sampling and shuffling.

    Returns:
        Shuffled DataFrame with min-class-count rows per label.
    """
    counts = df['label'].value_counts()
    min_count = counts.min()
    logger.info(f"Balancing: {counts.to_dict()} → {min_count:,} per class")

    # GroupBy.sample (pandas >= 1.1) replaces the groupby().apply(lambda g:
    # g.sample(...)) pattern, which triggers deprecation warnings about
    # applying to grouping columns in recent pandas versions.
    balanced = df.groupby('label', group_keys=False).sample(
        n=min_count, random_state=random_state
    )
    # Shuffle so classes are interleaved rather than blocked
    return balanced.sample(frac=1, random_state=random_state).reset_index(drop=True)
238
+
239
+
240
+ # ---------------------------------------------------------------------------
241
+ # Main
242
+ # ---------------------------------------------------------------------------
243
def main():
    """CLI entry point: load the dataset, extract combined URL + HTML
    features (with checkpoint/resume), optionally balance classes, and
    write the final feature CSV."""
    parser = argparse.ArgumentParser(
        description='Extract combined URL + HTML features from clean_dataset.csv')
    parser.add_argument('--input', type=str,
                        default='data/processed/clean_dataset.csv',
                        help='Input CSV with url,label columns')
    parser.add_argument('--output', type=str,
                        default='data/features/combined_features.csv',
                        help='Output CSV path')
    parser.add_argument('--workers', type=int, default=10,
                        help='Parallel download threads (default: 10)')
    parser.add_argument('--timeout', type=int, default=10,
                        help='HTTP timeout in seconds (default: 10)')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit total URLs (for testing)')
    parser.add_argument('--checkpoint-every', type=int, default=500,
                        help='Save checkpoint every N URLs (default: 500)')
    parser.add_argument('--no-balance', action='store_true',
                        help='Do not balance the output dataset')
    args = parser.parse_args()

    # Paths are resolved against the project root, not the cwd
    input_path = (PROJECT_ROOT / args.input).resolve()
    output_path = (PROJECT_ROOT / args.output).resolve()

    logger.info("=" * 70)
    logger.info("COMBINED URL + HTML FEATURE EXTRACTION")
    logger.info("=" * 70)
    logger.info(f"  Input:    {input_path}")
    logger.info(f"  Output:   {output_path}")
    logger.info(f"  Workers:  {args.workers}")
    logger.info(f"  Timeout:  {args.timeout}s")
    logger.info(f"  Balance:  {'YES' if not args.no_balance else 'NO'}")

    # --- Load dataset ---
    df = pd.read_csv(input_path)
    logger.info(f"\nLoaded {len(df):,} URLs")
    logger.info(f"  Label distribution: {df['label'].value_counts().to_dict()}")

    if args.limit:
        # Stratified limit (half the budget per class — assumes the two
        # labels 0/1; classes smaller than the per-class budget are taken
        # in full)
        per_class = args.limit // 2
        df = (
            df.groupby('label', group_keys=False)
            .apply(lambda g: g.sample(n=min(per_class, len(g)), random_state=42))
        )
        df = df.reset_index(drop=True)
        logger.info(f"  Limited to: {len(df):,} URLs")

    # --- Extract features (threaded, checkpointed) ---
    features_df = extract_all(
        df,
        max_workers=args.workers,
        timeout=args.timeout,
        checkpoint_every=args.checkpoint_every,
    )

    if features_df.empty:
        logger.error("No features extracted!")
        sys.exit(1)

    logger.info(f"\nExtracted features: {features_df.shape}")
    logger.info(f"  Label distribution: {features_df['label'].value_counts().to_dict()}")

    # --- Balance ---
    if not args.no_balance:
        features_df = balance_dataset(features_df)
        logger.info(f"  After balancing: {features_df.shape}")
        logger.info(f"  Label dist: {features_df['label'].value_counts().to_dict()}")

    # --- Reorder columns: url, label first, then sorted features ---
    meta_cols = ['url', 'label']
    feature_cols = sorted([c for c in features_df.columns if c not in meta_cols])
    features_df = features_df[meta_cols + feature_cols]

    # --- Clean up infinities / NaNs (models choke on both) ---
    features_df = features_df.replace([np.inf, -np.inf], 0)
    features_df = features_df.fillna(0)

    # --- Save ---
    output_path.parent.mkdir(parents=True, exist_ok=True)
    features_df.to_csv(output_path, index=False)

    # --- Cleanup checkpoint (run finished; no resume data needed) ---
    if CHECKPOINT_FILE.exists():
        CHECKPOINT_FILE.unlink()
        logger.info("Checkpoint file cleaned up")

    # --- Summary ---
    logger.info("\n" + "=" * 70)
    logger.info("EXTRACTION COMPLETE")
    logger.info("=" * 70)
    logger.info(f"  Total samples:   {len(features_df):,}")
    logger.info(f"  Legitimate:      {(features_df['label'] == 0).sum():,}")
    logger.info(f"  Phishing:        {(features_df['label'] == 1).sum():,}")
    logger.info(f"  Total features:  {len(feature_cols)}")
    url_feats = [c for c in feature_cols if c.startswith('url_')]
    html_feats = [c for c in feature_cols if c.startswith('html_')]
    logger.info(f"    URL features:  {len(url_feats)}")
    logger.info(f"    HTML features: {len(html_feats)}")
    logger.info(f"  Output: {output_path}")
    logger.info("=" * 70)


if __name__ == '__main__':
    main()
scripts/feature_extraction/html/__pycache__/feature_engineering.cpython-313.pyc ADDED
Binary file (5.64 kB). View file
 
scripts/feature_extraction/html/__pycache__/html_feature_extractor.cpython-313.pyc ADDED
Binary file (25.4 kB). View file
 
scripts/feature_extraction/html/extract_features.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Parallel HTML Feature Extraction Pipeline
3
+
4
+ Processes ~80k HTML files using multiprocessing for CPU-bound parsing.
5
+ Integrates quality filtering INTO the same parse pass (no double-parsing).
6
+ Includes checkpointing, progress tracking, and balanced output.
7
+
8
+ Usage:
9
+ python scripts/feature_extraction/html/extract_features.py
10
+ python scripts/feature_extraction/html/extract_features.py --no-filter
11
+ python scripts/feature_extraction/html/extract_features.py --workers 8
12
+ """
13
+ import argparse
14
+ import json
15
+ import logging
16
+ import sys
17
+ import time
18
+ from concurrent.futures import ProcessPoolExecutor, as_completed
19
+ from pathlib import Path
20
+
21
+ import pandas as pd
22
+ from tqdm import tqdm
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # Resolve project root so imports work regardless of cwd
26
+ # ---------------------------------------------------------------------------
27
+ PROJECT_ROOT = Path(__file__).resolve().parents[3] # src/
28
+ sys.path.insert(0, str(PROJECT_ROOT))
29
+
30
+ from scripts.feature_extraction.html.html_feature_extractor import HTMLFeatureExtractor
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # Logging
34
+ # ---------------------------------------------------------------------------
35
+ logging.basicConfig(
36
+ level=logging.INFO,
37
+ format='%(asctime)s - %(levelname)s - %(message)s',
38
+ datefmt='%H:%M:%S',
39
+ )
40
+ logger = logging.getLogger('extract_features')
41
+
42
+ # ---------------------------------------------------------------------------
43
+ # Quality filter constants
44
+ # ---------------------------------------------------------------------------
45
+ MIN_FILE_SIZE = 800 # bytes
46
+ MIN_TAGS = 8
47
+ MIN_WORDS = 30
48
+ ERROR_PATTERNS = [
49
+ 'page not found', '404 not found', '403 forbidden',
50
+ 'access denied', 'server error', 'not available',
51
+ 'domain for sale', 'website expired', 'coming soon',
52
+ 'under construction', 'parked domain', 'buy this domain',
53
+ 'domain has expired', 'this site can',
54
+ ]
55
+
56
+
57
+ # ---------------------------------------------------------------------------
58
+ # Worker function – runs in a subprocess
59
+ # ---------------------------------------------------------------------------
60
def _process_file(args: tuple) -> dict | None:
    """
    Worker: read one HTML file, optionally quality-filter it, extract features.

    Runs inside a ProcessPoolExecutor, so it must stay a picklable top-level
    function. Every failure mode is reported the same way: None (skip file).

    Args:
        args: 3-tuple (file_path_str, label, apply_filter).

    Returns:
        Feature dict augmented with 'filename' and 'label', or None when the
        file is filtered out or any error occurs.
    """
    file_path_str, label, apply_filter = args

    try:
        path = Path(file_path_str)
        raw = path.read_text(encoding='utf-8', errors='ignore')

        # Cheap size check before paying for a full parse.
        if apply_filter and len(raw) < MIN_FILE_SIZE:
            return None

        # Parse once: lxml is fastest, html.parser is the safe fallback.
        from bs4 import BeautifulSoup
        try:
            soup = BeautifulSoup(raw, 'lxml')
        except Exception:
            soup = BeautifulSoup(raw, 'html.parser')

        # Quality filter operates on the soup we already built.
        if apply_filter:
            if not soup.find('body'):
                return None

            if len(soup.find_all()) < MIN_TAGS:
                return None

            text = soup.get_text(separator=' ', strip=True).lower()
            if len(text.split()) < MIN_WORDS:
                return None

            # Error/parked pages announce themselves early on (first 2000 chars).
            head = text[:2000]
            if any(pattern in head for pattern in ERROR_PATTERNS):
                return None

            # Require at least some real content elements.
            has_content = (
                len(soup.find_all('a')) > 0 or
                len(soup.find_all('form')) > 0 or
                len(soup.find_all('input')) > 0 or
                len(soup.find_all('img')) > 0 or
                len(soup.find_all('div')) > 3
            )
            if not has_content:
                return None

        # Feature extraction re-parses internally (with its own cache).
        result = HTMLFeatureExtractor().extract_features(raw)
        result['filename'] = path.name
        result['label'] = label
        return result

    except Exception:
        # Best-effort worker: any unexpected failure just skips the file.
        return None
132
+
133
+
134
+ # ---------------------------------------------------------------------------
135
+ # Directory processor
136
+ # ---------------------------------------------------------------------------
137
def extract_from_directory(
    html_dir: Path,
    label: int,
    apply_filter: bool = True,
    max_workers: int = 6,
    limit: int | None = None,
) -> list[dict]:
    """
    Extract features from all .html files in a directory using multiprocessing.

    Args:
        html_dir: Directory with .html files
        label: 0 = legitimate, 1 = phishing
        apply_filter: Apply quality filter
        max_workers: Number of parallel workers
        limit: Max files to return (None = all)

    Returns:
        List of feature dictionaries (one per kept file)
    """
    html_files = sorted(html_dir.glob('*.html'))
    total = len(html_files)
    label_name = 'Phishing' if label == 1 else 'Legitimate'

    logger.info(f"\n{'='*60}")
    logger.info(f"Processing {label_name}: {total:,} files")
    logger.info(f"  Directory: {html_dir}")
    logger.info(f"  Quality filter: {'ON' if apply_filter else 'OFF'}")
    logger.info(f"  Workers: {max_workers}")
    logger.info(f"{'='*60}")

    # Build task list
    tasks = [(str(f), label, apply_filter) for f in html_files]

    results: list[dict] = []
    n_filtered = 0
    n_done = 0  # futures actually completed (needed for an honest rate)
    t0 = time.perf_counter()

    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        # The submitted task args are never needed afterwards, so a plain
        # list of futures suffices (previously a dict with unused values).
        futures = [pool.submit(_process_file, t) for t in tasks]

        with tqdm(total=total, desc=f'{label_name}', unit='file') as pbar:
            for future in as_completed(futures):
                pbar.update(1)
                n_done += 1
                result = future.result()
                if result is None:
                    n_filtered += 1
                else:
                    results.append(result)
                    if limit and len(results) >= limit:
                        # Cancel futures that have not started yet; already
                        # running ones finish during pool shutdown.
                        for f in futures:
                            f.cancel()
                        break

    elapsed = time.perf_counter() - t0
    # BUGFIX: the rate was previously computed over `total`, which overstated
    # throughput whenever `limit` stopped the loop early. Use the number of
    # files actually processed instead.
    speed = n_done / elapsed if elapsed > 0 else 0

    logger.info(f"  Extracted: {len(results):,} quality samples")
    logger.info(f"  Filtered out: {n_filtered:,} ({n_filtered/max(total,1)*100:.1f}%)")
    logger.info(f"  Time: {elapsed:.1f}s ({speed:.0f} files/sec)")

    return results
200
+
201
+
202
+ # ---------------------------------------------------------------------------
203
+ # Main
204
+ # ---------------------------------------------------------------------------
205
def main():
    """CLI entry point: extract HTML features for both classes and save a CSV."""
    parser = argparse.ArgumentParser(
        description='Extract HTML features for phishing detection (parallel)')
    parser.add_argument('--phishing-dir', type=str, nargs='+',
                        default=['data/html/phishing', 'data/html/phishing_v1'],
                        help='Directories with phishing HTML files')
    parser.add_argument('--legit-dir', type=str, nargs='+',
                        default=['data/html/legitimate', 'data/html/legitimate_v1'],
                        help='Directories with legitimate HTML files')
    parser.add_argument('--output', type=str, default='data/features/html_features.csv',
                        help='Output CSV path')
    parser.add_argument('--workers', type=int, default=6,
                        help='Number of parallel workers (default: 6)')
    parser.add_argument('--no-filter', action='store_true',
                        help='Disable quality filtering')
    parser.add_argument('--limit', type=int, default=None,
                        help='Limit samples per class (for testing)')
    parser.add_argument('--no-balance', action='store_true',
                        help='Do not balance classes')
    args = parser.parse_args()

    apply_filter = not args.no_filter

    # Resolve paths relative to project root
    phishing_dirs = [(PROJECT_ROOT / d).resolve() for d in args.phishing_dir]
    legit_dirs = [(PROJECT_ROOT / d).resolve() for d in args.legit_dir]
    output_path = (PROJECT_ROOT / args.output).resolve()

    logger.info("=" * 70)
    logger.info("HTML FEATURE EXTRACTION PIPELINE")
    logger.info("=" * 70)
    for d in phishing_dirs:
        logger.info(f"  Phishing dir: {d}")
    for d in legit_dirs:
        logger.info(f"  Legitimate dir: {d}")
    logger.info(f"  Output: {output_path}")
    logger.info(f"  Workers: {args.workers}")
    logger.info(f"  Quality filter: {'ON' if apply_filter else 'OFF'}")

    # Validate directories (missing ones are skipped, not fatal)
    for d in phishing_dirs:
        if not d.exists():
            logger.warning(f"Phishing directory not found (skipping): {d}")
    for d in legit_dirs:
        if not d.exists():
            logger.warning(f"Legitimate directory not found (skipping): {d}")

    # ---- Extract features ----
    t_start = time.perf_counter()

    def _collect(dirs: list, label: int) -> list[dict]:
        """Extract from each existing directory while honouring --limit per class."""
        collected: list[dict] = []
        for d in dirs:
            if not d.exists():
                continue
            # BUGFIX: --limit is documented as "per class", but it used to be
            # passed unchanged to every directory, so N directories could yield
            # up to N * limit samples. Pass only the remaining budget instead.
            remaining = None
            if args.limit is not None:
                remaining = args.limit - len(collected)
                if remaining <= 0:
                    break
            collected.extend(extract_from_directory(
                d, label=label, apply_filter=apply_filter,
                max_workers=args.workers, limit=remaining))
        return collected

    phishing_features = _collect(phishing_dirs, label=1)
    legit_features = _collect(legit_dirs, label=0)

    # ---- Balance (downsample the majority class to the minority size) ----
    if not args.no_balance:
        min_count = min(len(phishing_features), len(legit_features))
        logger.info(f"\nBalancing to {min_count:,} per class")
        # Shuffle before truncating to get a random sample
        import random
        random.seed(42)
        random.shuffle(phishing_features)
        random.shuffle(legit_features)
        phishing_features = phishing_features[:min_count]
        legit_features = legit_features[:min_count]

    # ---- Build DataFrame ----
    all_features = phishing_features + legit_features
    if not all_features:
        logger.error("No features extracted!")
        sys.exit(1)

    df = pd.DataFrame(all_features)

    # Reorder columns: metadata first, then sorted features
    meta_cols = ['filename', 'label']
    feature_cols = sorted([c for c in df.columns if c not in meta_cols])
    df = df[meta_cols + feature_cols]

    # Shuffle rows so classes are interleaved
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # ---- Save ----
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)

    elapsed = time.perf_counter() - t_start

    # ---- Summary ----
    logger.info("\n" + "=" * 70)
    logger.info("EXTRACTION COMPLETE")
    logger.info("=" * 70)
    logger.info(f"  Total samples: {len(df):,}")
    logger.info(f"  Phishing: {(df['label']==1).sum():,}")
    logger.info(f"  Legitimate: {(df['label']==0).sum():,}")
    logger.info(f"  Features: {len(feature_cols)}")
    logger.info(f"  Total time: {elapsed:.1f}s")
    logger.info(f"  Output: {output_path}")
    logger.info("=" * 70)

    # Quick sanity check of feature distributions
    numeric = df[feature_cols].describe().T[['mean', 'std', 'min', 'max']]
    logger.info(f"\nFeature statistics (sample):")
    logger.info(numeric.head(15).to_string())
319
+
320
+
321
# Standard script entry point guard (keeps import side-effect free,
# which also matters for the multiprocessing workers spawned above).
if __name__ == '__main__':
    main()
scripts/feature_extraction/html/feature_engineering.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Shared Feature Engineering for HTML-based Phishing Detection
3
+
4
+ Creates derived features from raw HTML features to improve model performance.
5
+ Used by both XGBoost and Random Forest training pipelines.
6
+ """
7
+ import numpy as np
8
+ import pandas as pd
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def engineer_features(X: pd.DataFrame) -> pd.DataFrame:
    """
    Derive ratio, interaction and risk-score features from raw HTML features.

    Adds phishing-specific derived columns on top of the raw feature set.

    Args:
        X: DataFrame with raw feature columns (no 'label'/'filename')

    Returns:
        Copy of X extended with the engineered columns (inf/NaN replaced by 0)
    """
    X = X.copy()

    def opt(col: str):
        # Optional raw feature: fall back to scalar 0 when the column is absent.
        return X.get(col, 0)

    # ---- Ratio features (denominator offset by +1 to avoid division by zero) ----
    ratio_specs = {
        'forms_to_inputs_ratio': ('num_forms', 'num_input_fields'),
        'external_to_total_links': ('num_external_links', 'num_links'),
        'scripts_to_tags_ratio': ('num_scripts', 'num_tags'),
        'hidden_to_visible_inputs': ('num_hidden_fields', 'num_input_fields'),
        'password_to_inputs_ratio': ('num_password_fields', 'num_input_fields'),
        'empty_to_total_links': ('num_empty_links', 'num_links'),
        'images_to_tags_ratio': ('num_images', 'num_tags'),
        'iframes_to_tags_ratio': ('num_iframes', 'num_tags'),
    }
    for name, (numerator, denominator) in ratio_specs.items():
        X[name] = X[numerator] / (X[denominator] + 1)

    # ---- Interaction features (products of suspicious signals) ----
    product_specs = {
        'forms_with_passwords': ('num_forms', 'num_password_fields'),
        'external_scripts_links': ('num_external_links', 'num_external_scripts'),
        'urgency_with_forms': ('num_urgency_keywords', 'num_forms'),
        'brand_with_forms': ('num_brand_mentions', 'num_forms'),
        'iframes_with_scripts': ('num_iframes', 'num_scripts'),
        'hidden_with_external': ('num_hidden_fields', 'num_external_form_actions'),
    }
    for name, (left, right) in product_specs.items():
        X[name] = X[left] * X[right]

    # ---- Content density features ----
    X['content_density'] = (X['text_length'] + 1) / (X['num_divs'] + X['num_spans'] + 1)
    X['form_density'] = X['num_forms'] / (X['num_divs'] + 1)
    X['scripts_per_form'] = X['num_scripts'] / (X['num_forms'] + 1)
    X['links_per_word'] = X['num_links'] / (X['num_words'] + 1)

    # ---- Weighted risk scores ----
    X['phishing_risk_score'] = (
        2 * X['num_urgency_keywords']
        + 2 * X['num_brand_mentions']
        + 3 * X['num_password_fields']
        + 2 * X['num_iframes']
        + 4 * opt('num_hidden_iframes')
        + 3 * opt('num_anchor_text_mismatch')
        + 2 * opt('num_suspicious_tld_links')
        + 3 * opt('has_login_form')
    )

    X['form_risk_score'] = (
        3 * X['num_password_fields']
        + 2 * X['num_external_form_actions']
        + X['num_empty_form_actions']
        + X['num_hidden_fields']
    )

    X['obfuscation_score'] = (
        X['has_eval']
        + X['has_unescape']
        + X['has_escape']
        + X['has_document_write']
        + opt('has_base64')
        + opt('has_atob')
        + opt('has_fromcharcode')
    )

    X['legitimacy_score'] = (
        X['has_title']
        + opt('has_description')
        + opt('has_viewport')
        + opt('has_favicon')
        + opt('has_copyright')
        + opt('has_author')
        + (X['num_meta_tags'] > 3).astype(int)
        + (X['num_css_files'] > 0).astype(int)
    )

    # ---- Boolean aggregation of cloaking/redirect signals ----
    X['has_suspicious_elements'] = (
        (opt('has_meta_refresh') == 1)
        | (X['num_iframes'] > 0)
        | (X['num_hidden_fields'] > 3)
        | (opt('has_location_replace') == 1)
    ).astype(int)

    # ---- Clean up ----
    return X.replace([np.inf, -np.inf], 0).fillna(0)
106
+
107
+
108
def get_engineered_feature_names() -> list[str]:
    """Return the names of the columns that engineer_features() adds."""
    ratio_names = [
        'forms_to_inputs_ratio', 'external_to_total_links',
        'scripts_to_tags_ratio', 'hidden_to_visible_inputs',
        'password_to_inputs_ratio', 'empty_to_total_links',
        'images_to_tags_ratio', 'iframes_to_tags_ratio',
    ]
    interaction_names = [
        'forms_with_passwords', 'external_scripts_links',
        'urgency_with_forms', 'brand_with_forms',
        'iframes_with_scripts', 'hidden_with_external',
    ]
    density_names = [
        'content_density', 'form_density', 'scripts_per_form', 'links_per_word',
    ]
    score_names = [
        'phishing_risk_score', 'form_risk_score',
        'obfuscation_score', 'legitimacy_score',
    ]
    boolean_names = ['has_suspicious_elements']
    # Order mirrors the sections of engineer_features().
    return (ratio_names + interaction_names + density_names
            + score_names + boolean_names)
scripts/feature_extraction/html/html_feature_extractor.py ADDED
@@ -0,0 +1,510 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Optimized HTML Feature Extractor for Phishing Detection
3
+
4
+ Extracts 67 features from HTML content with single-parse efficiency.
5
+ Uses cached tag lookups to avoid redundant find_all() calls.
6
+ """
7
+ import re
8
+ from urllib.parse import urlparse
9
+ from bs4 import BeautifulSoup
10
+ import logging
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ # Suspicious TLDs commonly used in phishing
15
+ SUSPICIOUS_TLDS = {
16
+ '.tk', '.ml', '.ga', '.cf', '.gq', '.top', '.xyz', '.buzz',
17
+ '.club', '.online', '.site', '.icu', '.work', '.click', '.link',
18
+ '.info', '.pw', '.cc', '.ws', '.bid', '.stream', '.racing',
19
+ }
20
+
21
+ # Brand keywords phishers commonly impersonate
22
+ BRAND_KEYWORDS = [
23
+ 'paypal', 'amazon', 'google', 'microsoft', 'apple', 'facebook',
24
+ 'netflix', 'ebay', 'instagram', 'twitter', 'linkedin', 'yahoo',
25
+ 'bank', 'visa', 'mastercard', 'americanexpress', 'chase', 'wells',
26
+ 'citibank', 'dhl', 'fedex', 'ups', 'usps', 'dropbox', 'adobe',
27
+ 'spotify', 'whatsapp', 'telegram', 'steam', 'coinbase', 'binance',
28
+ ]
29
+
30
+ # Urgency / social engineering keywords
31
+ URGENCY_KEYWORDS = [
32
+ 'urgent', 'verify', 'suspended', 'locked', 'confirm',
33
+ 'security', 'alert', 'warning', 'expire', 'limited',
34
+ 'immediately', 'click here', 'act now', 'unusual activity',
35
+ 'unauthorized', 'restricted', 'risk', 'compromised',
36
+ 'your account', 'update your', 'verify your', 'confirm your',
37
+ 'within 24', 'within 48', 'action required',
38
+ ]
39
+
40
+
41
+ class HTMLFeatureExtractor:
42
+ """
43
+ High-performance HTML feature extractor.
44
+
45
+ Parses HTML once and caches all tag lookups for efficiency.
46
+ Designed for batch processing of 40k+ files.
47
+ """
48
+
49
+ def extract_features(self, html_content: str, url: str | None = None) -> dict:
50
+ """
51
+ Extract all features from HTML content in a single pass.
52
+
53
+ Args:
54
+ html_content: Raw HTML string
55
+ url: Optional source URL for context
56
+
57
+ Returns:
58
+ Dictionary with 67 numeric features
59
+ """
60
+ try:
61
+ # --- Single parse with fast parser ---
62
+ try:
63
+ soup = BeautifulSoup(html_content, 'lxml')
64
+ except Exception:
65
+ soup = BeautifulSoup(html_content, 'html.parser')
66
+
67
+ # --- Cache tag lookups (done ONCE) ---
68
+ cache = self._build_cache(soup)
69
+
70
+ features = {}
71
+ features.update(self._structure_features(soup, cache, html_content))
72
+ features.update(self._form_features(cache))
73
+ features.update(self._link_features(cache))
74
+ features.update(self._script_features(cache))
75
+ features.update(self._text_features(soup, cache))
76
+ features.update(self._meta_features(soup, cache))
77
+ features.update(self._resource_features(cache))
78
+ features.update(self._advanced_features(soup, cache))
79
+ return features
80
+
81
+ except Exception as e:
82
+ logger.debug(f"Feature extraction error: {e}")
83
+ return self._default_features()
84
+
85
+ # ------------------------------------------------------------------
86
+ # Cache builder – avoids redundant find_all() across feature groups
87
+ # ------------------------------------------------------------------
88
+ @staticmethod
89
+ def _build_cache(soup) -> dict:
90
+ """Build a lookup cache of all tags we need. Called once per document."""
91
+ all_tags = soup.find_all()
92
+
93
+ # Classify tags by name in a single pass
94
+ by_name: dict[str, list] = {}
95
+ for tag in all_tags:
96
+ by_name.setdefault(tag.name, []).append(tag)
97
+
98
+ # Convenience lists used by multiple feature groups
99
+ links_a = by_name.get('a', [])
100
+ forms = by_name.get('form', [])
101
+ inputs = by_name.get('input', [])
102
+ scripts = by_name.get('script', [])
103
+ images = by_name.get('img', [])
104
+ iframes = by_name.get('iframe', [])
105
+ meta_tags = by_name.get('meta', [])
106
+ style_tags = by_name.get('style', [])
107
+ css_links = [t for t in by_name.get('link', [])
108
+ if t.get('rel') and 'stylesheet' in t.get('rel', [])]
109
+ all_link_tags = by_name.get('link', [])
110
+
111
+ # Pre-extract hrefs and input types (used in several groups)
112
+ hrefs = [a.get('href', '') or '' for a in links_a]
113
+ input_types = [(inp, (inp.get('type', '') or '').lower()) for inp in inputs]
114
+
115
+ return {
116
+ 'all_tags': all_tags,
117
+ 'by_name': by_name,
118
+ 'links_a': links_a,
119
+ 'hrefs': hrefs,
120
+ 'forms': forms,
121
+ 'inputs': inputs,
122
+ 'input_types': input_types,
123
+ 'scripts': scripts,
124
+ 'images': images,
125
+ 'iframes': iframes,
126
+ 'meta_tags': meta_tags,
127
+ 'style_tags': style_tags,
128
+ 'css_links': css_links,
129
+ 'all_link_tags': all_link_tags,
130
+ }
131
+
132
+ # ------------------------------------------------------------------
133
+ # 1. Structure features (12)
134
+ # ------------------------------------------------------------------
135
    @staticmethod
    def _structure_features(soup, c: dict, raw_html: str) -> dict:
        """Structural features (12): tag counts, DOM depth, title presence.

        Args:
            soup: Parsed BeautifulSoup document.
            c: Lookup cache produced by _build_cache().
            raw_html: Original HTML string (used only for total length).

        Returns:
            Dict of 12 integer features.
        """
        bn = c['by_name']

        # DOM depth – walk just the <body>.
        # Iterative DFS with an explicit stack; only element nodes (children
        # with a truthy .name, i.e. not text/comment nodes) increase depth.
        body = soup.find('body')
        max_depth = 0
        if body:
            stack = [(body, 0)]
            while stack:
                node, depth = stack.pop()
                if depth > max_depth:
                    max_depth = depth
                # getattr guards nodes that have no .children attribute
                for child in getattr(node, 'children', []):
                    if hasattr(child, 'name') and child.name:
                        stack.append((child, depth + 1))

        return {
            'html_length': len(raw_html),
            'num_tags': len(c['all_tags']),
            'num_divs': len(bn.get('div', [])),
            'num_spans': len(bn.get('span', [])),
            'num_paragraphs': len(bn.get('p', [])),
            'num_headings': sum(len(bn.get(h, []))
                                for h in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')),
            'num_lists': len(bn.get('ul', [])) + len(bn.get('ol', [])),
            'num_images': len(c['images']),
            'num_iframes': len(c['iframes']),
            'num_tables': len(bn.get('table', [])),
            'has_title': 1 if soup.find('title') else 0,
            'dom_depth': max_depth,
        }
167
+
168
+ # ------------------------------------------------------------------
169
+ # 2. Form features (11)
170
+ # ------------------------------------------------------------------
171
+ @staticmethod
172
+ def _form_features(c: dict) -> dict:
173
+ forms = c['forms']
174
+ input_types = c['input_types']
175
+
176
+ n_password = sum(1 for _, t in input_types if t == 'password')
177
+ n_email = sum(1 for _, t in input_types if t == 'email')
178
+ n_text = sum(1 for _, t in input_types if t == 'text')
179
+ n_hidden = sum(1 for _, t in input_types if t == 'hidden')
180
+ n_submit = sum(1 for _, t in input_types if t == 'submit')
181
+ # Also count <button type="submit">
182
+ n_submit += sum(1 for btn in c['by_name'].get('button', [])
183
+ if (btn.get('type', '') or '').lower() == 'submit')
184
+
185
+ form_actions = [f.get('action', '') or '' for f in forms]
186
+ n_ext_action = sum(1 for a in form_actions if a.startswith('http'))
187
+ n_empty_action = sum(1 for a in form_actions if not a or a == '#')
188
+
189
+ return {
190
+ 'num_forms': len(forms),
191
+ 'num_input_fields': len(c['inputs']),
192
+ 'num_password_fields': n_password,
193
+ 'num_email_fields': n_email,
194
+ 'num_text_fields': n_text,
195
+ 'num_submit_buttons': n_submit,
196
+ 'num_hidden_fields': n_hidden,
197
+ 'has_login_form': 1 if (n_password > 0 and (n_email > 0 or n_text > 0)) else 0,
198
+ 'has_form': 1 if forms else 0,
199
+ 'num_external_form_actions': n_ext_action,
200
+ 'num_empty_form_actions': n_empty_action,
201
+ }
202
+
203
+ # ------------------------------------------------------------------
204
+ # 3. Link features (10)
205
+ # ------------------------------------------------------------------
206
+ @staticmethod
207
+ def _link_features(c: dict) -> dict:
208
+ hrefs = c['hrefs']
209
+ links_a = c['links_a']
210
+ n_links = len(links_a)
211
+
212
+ n_external = sum(1 for h in hrefs if h.startswith('http'))
213
+ n_internal = sum(1 for h in hrefs if h.startswith('/') or h.startswith('#'))
214
+ n_empty = sum(1 for h in hrefs if not h or h == '#')
215
+ n_mailto = sum(1 for h in hrefs if h.startswith('mailto:'))
216
+ n_js = sum(1 for h in hrefs if 'javascript:' in h.lower())
217
+ n_ip = sum(1 for h in hrefs
218
+ if re.search(r'https?://\d+\.\d+\.\d+\.\d+', h))
219
+
220
+ # Count links pointing to suspicious TLDs
221
+ n_suspicious_tld = 0
222
+ for h in hrefs:
223
+ if h.startswith('http'):
224
+ try:
225
+ netloc = urlparse(h).netloc.lower()
226
+ for tld in SUSPICIOUS_TLDS:
227
+ if netloc.endswith(tld):
228
+ n_suspicious_tld += 1
229
+ break
230
+ except Exception:
231
+ pass
232
+
233
+ ratio_ext = n_external / n_links if n_links > 0 else 0.0
234
+
235
+ return {
236
+ 'num_links': n_links,
237
+ 'num_external_links': n_external,
238
+ 'num_internal_links': n_internal,
239
+ 'num_empty_links': n_empty,
240
+ 'num_mailto_links': n_mailto,
241
+ 'num_javascript_links': n_js,
242
+ 'ratio_external_links': ratio_ext,
243
+ 'num_ip_based_links': n_ip,
244
+ 'num_suspicious_tld_links': n_suspicious_tld,
245
+ 'num_anchor_text_mismatch': HTMLFeatureExtractor._anchor_mismatch(links_a),
246
+ }
247
+
248
+ @staticmethod
249
+ def _anchor_mismatch(links_a: list) -> int:
250
+ """Count links where visible text shows a domain different from href."""
251
+ count = 0
252
+ url_pattern = re.compile(r'https?://[^\s<>"\']+')
253
+ for a in links_a:
254
+ href = a.get('href', '') or ''
255
+ text = a.get_text(strip=True)
256
+ if not href.startswith('http') or not text:
257
+ continue
258
+ text_urls = url_pattern.findall(text)
259
+ if text_urls:
260
+ try:
261
+ href_domain = urlparse(href).netloc.lower()
262
+ text_domain = urlparse(text_urls[0]).netloc.lower()
263
+ if href_domain and text_domain and href_domain != text_domain:
264
+ count += 1
265
+ except Exception:
266
+ pass
267
+ return count
268
+
269
+ # ------------------------------------------------------------------
270
+ # 4. Script features (7)
271
+ # ------------------------------------------------------------------
272
+ @staticmethod
273
+ def _script_features(c: dict) -> dict:
274
+ scripts = c['scripts']
275
+ n_inline = 0
276
+ n_external = 0
277
+ script_text_parts = []
278
+
279
+ for s in scripts:
280
+ if s.get('src'):
281
+ n_external += 1
282
+ if s.string:
283
+ n_inline += 1
284
+ script_text_parts.append(s.string)
285
+
286
+ script_content = ' '.join(script_text_parts)
287
+
288
+ return {
289
+ 'num_scripts': len(scripts),
290
+ 'num_inline_scripts': n_inline,
291
+ 'num_external_scripts': n_external,
292
+ 'has_eval': 1 if 'eval(' in script_content else 0,
293
+ 'has_unescape': 1 if 'unescape(' in script_content else 0,
294
+ 'has_escape': 1 if 'escape(' in script_content else 0,
295
+ 'has_document_write': 1 if 'document.write' in script_content else 0,
296
+ }
297
+
298
+ # ------------------------------------------------------------------
299
+ # 5. Text content features (8)
300
+ # ------------------------------------------------------------------
301
+ @staticmethod
302
+ def _text_features(soup, c: dict) -> dict:
303
+ text = soup.get_text(separator=' ', strip=True).lower()
304
+ words = text.split()
305
+ n_words = len(words)
306
+ html_len = len(str(soup))
307
+
308
+ return {
309
+ 'text_length': len(text),
310
+ 'num_words': n_words,
311
+ 'text_to_html_ratio': len(text) / html_len if html_len > 0 else 0.0,
312
+ 'num_brand_mentions': sum(1 for kw in BRAND_KEYWORDS if kw in text),
313
+ 'num_urgency_keywords': sum(1 for kw in URGENCY_KEYWORDS if kw in text),
314
+ 'has_copyright': 1 if ('©' in text or 'copyright' in text) else 0,
315
+ 'has_phone_number': 1 if re.search(
316
+ r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', text) else 0,
317
+ 'has_email_address': 1 if re.search(
318
+ r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}', text) else 0,
319
+ }
320
+
321
+ # ------------------------------------------------------------------
322
+ # 6. Meta tag features (6)
323
+ # ------------------------------------------------------------------
324
+ @staticmethod
325
+ def _meta_features(soup, c: dict) -> dict:
326
+ meta_tags = c['meta_tags']
327
+
328
+ has_refresh = 0
329
+ has_desc = 0
330
+ has_keywords = 0
331
+ has_author = 0
332
+ has_viewport = 0
333
+ for m in meta_tags:
334
+ name_attr = (m.get('name') or '').lower()
335
+ http_equiv = (m.get('http-equiv') or '').lower()
336
+ if name_attr == 'description':
337
+ has_desc = 1
338
+ elif name_attr == 'keywords':
339
+ has_keywords = 1
340
+ elif name_attr == 'author':
341
+ has_author = 1
342
+ elif name_attr == 'viewport':
343
+ has_viewport = 1
344
+ if http_equiv == 'refresh':
345
+ has_refresh = 1
346
+
347
+ return {
348
+ 'num_meta_tags': len(meta_tags),
349
+ 'has_description': has_desc,
350
+ 'has_keywords': has_keywords,
351
+ 'has_author': has_author,
352
+ 'has_viewport': has_viewport,
353
+ 'has_meta_refresh': has_refresh,
354
+ }
355
+
356
+ # ------------------------------------------------------------------
357
+ # 7. Resource features (7)
358
+ # ------------------------------------------------------------------
359
+ @staticmethod
360
+ def _resource_features(c: dict) -> dict:
361
+ css_links = c['css_links']
362
+ images = c['images']
363
+ style_tags = c['style_tags']
364
+
365
+ img_srcs = [img.get('src', '') or '' for img in images]
366
+ css_content = ''.join(tag.string or '' for tag in style_tags)
367
+
368
+ has_favicon = 0
369
+ for lt in c['all_link_tags']:
370
+ rel = lt.get('rel', [])
371
+ if 'icon' in rel or 'shortcut' in rel:
372
+ has_favicon = 1
373
+ break
374
+
375
+ return {
376
+ 'num_css_files': len(css_links),
377
+ 'num_external_css': sum(1 for lk in css_links
378
+ if (lk.get('href', '') or '').startswith('http')),
379
+ 'num_external_images': sum(1 for s in img_srcs if s.startswith('http')),
380
+ 'num_data_uri_images': sum(1 for s in img_srcs if s.startswith('data:')),
381
+ 'num_inline_styles': len(style_tags),
382
+ 'inline_css_length': len(css_content),
383
+ 'has_favicon': has_favicon,
384
+ }
385
+
386
+ # ------------------------------------------------------------------
387
+ # 8. Advanced phishing indicators (16)
388
+ # ------------------------------------------------------------------
389
+ @staticmethod
390
+ def _advanced_features(soup, c: dict) -> dict:
391
+ forms = c['forms']
392
+ input_types = c['input_types']
393
+ hrefs = c['hrefs']
394
+ all_text_lower = str(soup).lower()
395
+
396
+ # Password + external action combo
397
+ has_password = any(t == 'password' for _, t in input_types)
398
+ has_ext_action = any(
399
+ (f.get('action', '') or '').startswith('http') for f in forms)
400
+
401
+ # Count unique external domains from links
402
+ ext_domains = set()
403
+ for h in hrefs:
404
+ if h.startswith('http'):
405
+ try:
406
+ d = urlparse(h).netloc
407
+ if d:
408
+ ext_domains.add(d.lower())
409
+ except Exception:
410
+ pass
411
+
412
+ # Forms without labels
413
+ n_forms_no_label = sum(
414
+ 1 for f in forms
415
+ if not f.find_all('label') and f.find_all('input')
416
+ )
417
+
418
+ # Event handlers – single pass over all tags
419
+ n_onload = 0
420
+ n_onerror = 0
421
+ n_onclick = 0
422
+ for tag in c['all_tags']:
423
+ attrs = tag.attrs
424
+ if 'onload' in attrs:
425
+ n_onload += 1
426
+ if 'onerror' in attrs:
427
+ n_onerror += 1
428
+ if 'onclick' in attrs:
429
+ n_onclick += 1
430
+
431
+ # Iframe with small/zero dimensions (common cloaking)
432
+ n_hidden_iframes = 0
433
+ for iframe in c['iframes']:
434
+ w = iframe.get('width', '')
435
+ h = iframe.get('height', '')
436
+ style = (iframe.get('style', '') or '').lower()
437
+ if w in ('0', '1') or h in ('0', '1') or 'display:none' in style or 'visibility:hidden' in style:
438
+ n_hidden_iframes += 1
439
+
440
+ return {
441
+ 'password_with_external_action': 1 if (has_password and has_ext_action) else 0,
442
+ 'has_base64': 1 if 'base64' in all_text_lower else 0,
443
+ 'has_atob': 1 if 'atob(' in all_text_lower else 0,
444
+ 'has_fromcharcode': 1 if 'fromcharcode' in all_text_lower else 0,
445
+ 'num_onload_events': n_onload,
446
+ 'num_onerror_events': n_onerror,
447
+ 'num_onclick_events': n_onclick,
448
+ 'num_unique_external_domains': len(ext_domains),
449
+ 'num_forms_without_labels': n_forms_no_label,
450
+ 'has_display_none': 1 if ('display:none' in all_text_lower or
451
+ 'display: none' in all_text_lower) else 0,
452
+ 'has_visibility_hidden': 1 if ('visibility:hidden' in all_text_lower or
453
+ 'visibility: hidden' in all_text_lower) else 0,
454
+ 'has_window_open': 1 if 'window.open' in all_text_lower else 0,
455
+ 'has_location_replace': 1 if ('location.replace' in all_text_lower or
456
+ 'location.href' in all_text_lower) else 0,
457
+ 'num_hidden_iframes': n_hidden_iframes,
458
+ 'has_right_click_disabled': 1 if ('oncontextmenu' in all_text_lower and
459
+ 'return false' in all_text_lower) else 0,
460
+ 'has_status_bar_customization': 1 if ('window.status' in all_text_lower or
461
+ 'onmouseover' in all_text_lower) else 0,
462
+ }
463
+
464
+ # ------------------------------------------------------------------
465
+ # Default features (all zeros) – used on parse failure
466
+ # ------------------------------------------------------------------
467
+ def _default_features(self) -> dict:
468
+ return {k: 0 for k in self.get_feature_names()}
469
+
470
+ @staticmethod
471
+ def get_feature_names() -> list[str]:
472
+ """Return ordered list of all 67 feature names."""
473
+ return [
474
+ # Structure (12)
475
+ 'html_length', 'num_tags', 'num_divs', 'num_spans',
476
+ 'num_paragraphs', 'num_headings', 'num_lists', 'num_images',
477
+ 'num_iframes', 'num_tables', 'has_title', 'dom_depth',
478
+ # Form (11)
479
+ 'num_forms', 'num_input_fields', 'num_password_fields',
480
+ 'num_email_fields', 'num_text_fields', 'num_submit_buttons',
481
+ 'num_hidden_fields', 'has_login_form', 'has_form',
482
+ 'num_external_form_actions', 'num_empty_form_actions',
483
+ # Link (10)
484
+ 'num_links', 'num_external_links', 'num_internal_links',
485
+ 'num_empty_links', 'num_mailto_links', 'num_javascript_links',
486
+ 'ratio_external_links', 'num_ip_based_links',
487
+ 'num_suspicious_tld_links', 'num_anchor_text_mismatch',
488
+ # Script (7)
489
+ 'num_scripts', 'num_inline_scripts', 'num_external_scripts',
490
+ 'has_eval', 'has_unescape', 'has_escape', 'has_document_write',
491
+ # Text (8)
492
+ 'text_length', 'num_words', 'text_to_html_ratio',
493
+ 'num_brand_mentions', 'num_urgency_keywords',
494
+ 'has_copyright', 'has_phone_number', 'has_email_address',
495
+ # Meta (6)
496
+ 'num_meta_tags', 'has_description', 'has_keywords',
497
+ 'has_author', 'has_viewport', 'has_meta_refresh',
498
+ # Resource (7)
499
+ 'num_css_files', 'num_external_css', 'num_external_images',
500
+ 'num_data_uri_images', 'num_inline_styles',
501
+ 'inline_css_length', 'has_favicon',
502
+ # Advanced (16)
503
+ 'password_with_external_action', 'has_base64', 'has_atob',
504
+ 'has_fromcharcode', 'num_onload_events', 'num_onerror_events',
505
+ 'num_onclick_events', 'num_unique_external_domains',
506
+ 'num_forms_without_labels', 'has_display_none',
507
+ 'has_visibility_hidden', 'has_window_open',
508
+ 'has_location_replace', 'num_hidden_iframes',
509
+ 'has_right_click_disabled', 'has_status_bar_customization',
510
+ ]
scripts/feature_extraction/html/v1/__pycache__/html_features.cpython-313.pyc ADDED
Binary file (21.8 kB). View file
 
scripts/feature_extraction/html/v1/extract_html_features_simple.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Extract HTML Features - Direct from Files (No Metadata Needed)
3
+ Simplified version that scans directories directly
4
+ WITH QUALITY FILTERING to remove low-quality HTML files
5
+ """
6
+ import pandas as pd
7
+ from pathlib import Path
8
+ import logging
9
+ from tqdm import tqdm
10
+ import sys
11
+ import re
12
+ from bs4 import BeautifulSoup
13
+
14
+ # Add scripts directory to path
15
+ sys.path.append(str(Path(__file__).parent))
16
+
17
+ from html_features import HTMLFeatureExtractor
18
+
19
+ # Setup logging
20
+ logging.basicConfig(
21
+ level=logging.INFO,
22
+ format='%(asctime)s - %(levelname)s - %(message)s',
23
+ datefmt='%H:%M:%S'
24
+ )
25
+ logger = logging.getLogger(__name__)
26
+
27
# Quality filter constants (consumed by is_quality_html below)
MIN_FILE_SIZE = 1000  # Minimum 1KB
MIN_WORDS = 50  # Minimum 50 words of text content
MIN_TAGS = 10  # Minimum 10 HTML tags
# Substrings that mark error/parked/placeholder pages. Matched against the
# first 2000 characters of the lower-cased page text, so short markers such
# as '404' can also match incidental content — a deliberate, aggressive bias.
ERROR_PATTERNS = [
    'page not found', '404', '403', 'forbidden', 'access denied',
    'error occurred', 'server error', 'not available', 'suspended',
    'domain for sale', 'this site can', 'website expired',
    'coming soon', 'under construction', 'parked domain',
    'buy this domain', 'this domain', 'domain has expired'
]
38
+
39
+
40
def is_quality_html(html_content, filename=""):
    """
    Decide whether an HTML document is substantial enough to keep.

    Args:
        html_content: Raw HTML string.
        filename: Unused; kept for interface compatibility with callers.

    Returns:
        tuple: (is_valid, reason) — reason explains any rejection.
    """
    # Cheap size gate before paying for a full parse.
    if len(html_content) < MIN_FILE_SIZE:
        return False, f"Too small ({len(html_content)} bytes)"

    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        # Basic HTML structure: a <body> must exist.
        if not soup.find('body'):
            return False, "No body tag"

        # Enough tags to look like a real page.
        num_tags = len(soup.find_all())
        if num_tags < MIN_TAGS:
            return False, f"Too few tags ({num_tags})"

        # Enough visible text.
        text = soup.get_text(separator=' ', strip=True).lower()
        words = text.split()
        if len(words) < MIN_WORDS:
            return False, f"Too few words ({len(words)})"

        # Error/parked page markers — scan only the leading text.
        text_lower = text[:2000]
        for pattern in ERROR_PATTERNS:
            if pattern in text_lower:
                return False, f"Error page pattern: '{pattern}'"

        # Require at least one interactive element or some layout depth.
        interactive = (
            bool(soup.find_all('a'))
            or bool(soup.find_all('form'))
            or bool(soup.find_all('input'))
            or bool(soup.find_all('img'))
            or len(soup.find_all('div')) > 3
        )
        if not interactive:
            return False, "No interactive elements"

        # Pages that are almost entirely script are hard to analyse.
        script_content = ''.join(s.string or '' for s in soup.find_all('script'))
        if len(script_content) > len(text) * 3 and len(text) < 200:
            return False, "Mostly JavaScript, little content"

        return True, "OK"

    except Exception as e:
        return False, f"Parse error: {str(e)[:50]}"
94
+
95
+
96
def extract_features_from_directory(html_dir, label, limit=None, apply_filter=True):
    """
    Extract features from all HTML files in a directory.

    Args:
        html_dir: Directory containing HTML files
        label: Label for these files (0=legitimate, 1=phishing)
        limit: Maximum number of files to process (None = all)
        apply_filter: Apply quality filter to remove bad HTML files

    Returns:
        List of feature dictionaries (one per accepted file), each augmented
        with 'filename' and 'label' keys.
    """
    html_dir = Path(html_dir)
    logger.info(f"\nProcessing: {html_dir}")
    logger.info(f" Label: {'Phishing' if label == 1 else 'Legitimate'}")
    logger.info(f" Quality filter: {'ENABLED' if apply_filter else 'DISABLED'}")

    # Get all HTML files (sorted for deterministic ordering across runs)
    html_files = sorted(html_dir.glob('*.html'))
    total_files = len(html_files)
    logger.info(f" Found {total_files:,} HTML files")

    # Initialize extractor
    extractor = HTMLFeatureExtractor()

    results = []
    errors = 0
    filtered_out = 0
    filter_reasons = {}  # rejection reason -> count

    # Process each HTML file
    for html_path in tqdm(html_files,
                          desc=f"Extracting {'Phishing' if label == 1 else 'Legitimate'} features"):
        try:
            # Read HTML content
            with open(html_path, 'r', encoding='utf-8', errors='ignore') as f:
                html_content = f.read()

            # Apply quality filter if enabled
            if apply_filter:
                is_valid, reason = is_quality_html(html_content, html_path.name)
                if not is_valid:
                    filtered_out += 1
                    filter_reasons[reason] = filter_reasons.get(reason, 0) + 1
                    continue

            # Extract features
            features = extractor.extract_features(html_content, url=None)

            # Add metadata
            features['filename'] = html_path.name  # type: ignore
            features['label'] = label

            results.append(features)

            # Check if we reached the limit
            if limit and len(results) >= limit:
                logger.info(f" Reached limit of {limit:,} quality files")
                break

        except Exception as e:
            errors += 1
            # FIX: was `errors < 10`, which only showed the first 9 errors.
            if errors <= 10:  # Show first 10 errors
                logger.warning(f" Error processing {html_path.name}: {e}")

    logger.info(f" Quality files extracted: {len(results):,}")
    # FIX: guard against an empty directory — the unconditional percentage
    # previously raised ZeroDivisionError when total_files == 0.
    if total_files > 0:
        logger.info(f" Filtered out (low quality): {filtered_out:,} ({filtered_out/total_files*100:.1f}%)")
    else:
        logger.info(f" Filtered out (low quality): {filtered_out:,}")

    if filter_reasons and apply_filter:
        logger.info(f" Filter reasons (top 5):")
        for reason, count in sorted(filter_reasons.items(), key=lambda x: -x[1])[:5]:
            logger.info(f" - {reason}: {count:,}")

    if errors > 0:
        logger.warning(f" Errors: {errors:,}")

    return results
174
+
175
+
176
def main():
    """Run the full quality-filtered HTML feature extraction pipeline.

    Steps: count HTML files for both classes, extract features with the
    quality filter, balance the classes by truncation, shuffle, and write
    a single CSV with 'filename' and 'label' as the leading columns.
    """
    logger.info("="*80)
    logger.info("BALANCED HTML FEATURES EXTRACTION (WITH QUALITY FILTER)")
    logger.info("="*80)

    # Quality filter info
    logger.info("\nQuality Filter Criteria:")
    logger.info(f" - Minimum file size: {MIN_FILE_SIZE} bytes")
    logger.info(f" - Minimum word count: {MIN_WORDS} words")
    logger.info(f" - Minimum HTML tags: {MIN_TAGS}")
    logger.info(f" - Must have body tag")
    logger.info(f" - Not an error/parked page")
    logger.info(f" - Has interactive elements (links/forms/images)")

    # Paths (relative to the repository root the script is run from)
    phishing_html_dir = Path('data/html/phishing_v1')
    legit_html_dir = Path('data/html/legitimate_v1')
    output_path = Path('data/features/html_features_old.csv')

    # Check directories exist
    if not phishing_html_dir.exists():
        logger.error(f"Phishing directory not found: {phishing_html_dir}")
        return

    if not legit_html_dir.exists():
        logger.error(f"Legitimate directory not found: {legit_html_dir}")
        return

    # Count files
    logger.info("\n1. Checking available HTML files...")
    phishing_files = list(phishing_html_dir.glob('*.html'))
    legit_files = list(legit_html_dir.glob('*.html'))

    phishing_count = len(phishing_files)
    legit_count = len(legit_files)

    logger.info(f" Phishing HTML files: {phishing_count:,}")
    logger.info(f" Legitimate HTML files: {legit_count:,}")

    # Extract phishing features (with quality filter)
    logger.info("\n2. Extracting PHISHING HTML features (with quality filter)...")
    phishing_features = extract_features_from_directory(
        phishing_html_dir,
        label=1,  # Phishing
        limit=None,  # Get all quality files first
        apply_filter=True
    )

    # Extract legitimate features (with quality filter)
    logger.info("\n3. Extracting LEGITIMATE HTML features (with quality filter)...")
    legit_features = extract_features_from_directory(
        legit_html_dir,
        label=0,  # Legitimate
        limit=None,  # Get all quality files first
        apply_filter=True
    )

    # Balance the dataset by truncating the larger class
    logger.info("\n4. Balancing dataset...")
    min_count = min(len(phishing_features), len(legit_features))
    logger.info(f" Quality phishing samples: {len(phishing_features):,}")
    logger.info(f" Quality legitimate samples: {len(legit_features):,}")
    logger.info(f" Balancing to: {min_count:,} per class")

    # Truncate to balanced size
    phishing_features = phishing_features[:min_count]
    legit_features = legit_features[:min_count]

    # Combine results
    logger.info("\n5. Combining datasets...")
    all_features = phishing_features + legit_features

    if len(all_features) == 0:
        logger.error("No features extracted! Check error messages above.")
        return

    # Create DataFrame
    logger.info("\n6. Creating features DataFrame...")
    features_df = pd.DataFrame(all_features)

    # Reorder columns (filename and label first, then features)
    feature_cols = [col for col in features_df.columns if col not in ['filename', 'label']]
    features_df = features_df[['filename', 'label'] + feature_cols]

    # Shuffle dataset (fixed seed for reproducibility)
    features_df = features_df.sample(frac=1, random_state=42).reset_index(drop=True)

    logger.info(f" Shape: {features_df.shape}")
    logger.info(f" Features: {len(feature_cols)}")

    # Show label distribution
    logger.info(f"\n Label distribution:")
    label_counts = features_df['label'].value_counts()
    for label, count in label_counts.items():
        label_name = 'Phishing' if label == 1 else 'Legitimate'
        logger.info(f" {label_name}: {count:,} ({count/len(features_df)*100:.1f}%)")

    # Save to CSV
    logger.info(f"\n7. Saving features to: {output_path}")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    features_df.to_csv(output_path, index=False)
    logger.info(f" ✓ Saved!")

    # Show statistics
    logger.info("\n" + "="*80)
    logger.info("EXTRACTION SUMMARY")
    logger.info("="*80)
    logger.info(f"\nTotal samples: {len(features_df):,}")
    logger.info(f" Phishing: {len(phishing_features):,}")
    logger.info(f" Legitimate: {len(legit_features):,}")
    logger.info(f"\nFeatures extracted: {len(feature_cols)}")
    # NOTE(review): computed after truncation, so this is always 100.0%;
    # also raises KeyError if either class ended up empty — confirm intent.
    logger.info(f"Dataset balance: {(label_counts[0]/label_counts[1])*100:.1f}%")

    # Show sample statistics
    logger.info(f"\nFeature statistics (first 10 features):")
    numeric_cols = features_df.select_dtypes(include=['int64', 'float64']).columns[:10]
    stats = features_df[numeric_cols].describe()
    logger.info(f"\n{stats.to_string()}")

    logger.info("\n" + "="*80)
    logger.info("✓ QUALITY-FILTERED HTML FEATURES EXTRACTION COMPLETE!")
    logger.info("="*80)
    logger.info(f"\nOutput file: {output_path}")
    logger.info(f"Shape: {features_df.shape}")
    logger.info(f"Quality filter removed low-quality HTML files")
    logger.info("="*80)
302
+
303
+
304
# Script entry point: run the extraction pipeline when executed directly.
if __name__ == '__main__':
    main()
scripts/feature_extraction/html/v1/html_features.py ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HTML Feature Extractor for Phishing Detection
3
+ Extracts ~50 features from HTML content including forms, links, scripts, etc.
4
+ """
5
+ import re
6
+ from pathlib import Path
7
+ from bs4 import BeautifulSoup
8
+ from urllib.parse import urlparse
9
+ import pandas as pd
10
+ import numpy as np
11
+
12
+
13
class HTMLFeatureExtractor:
    """Extract features from HTML content for phishing detection.

    All features are numeric (counts, lengths, ratios, or 0/1 flags) so the
    output dict can be loaded straight into a DataFrame.
    """

    def __init__(self):
        # Common legitimate brand keywords (matched in lower-cased page text)
        self.brand_keywords = [
            'paypal', 'amazon', 'google', 'microsoft', 'apple', 'facebook',
            'netflix', 'ebay', 'instagram', 'twitter', 'linkedin', 'yahoo',
            'bank', 'visa', 'mastercard', 'americanexpress', 'chase', 'wells',
            'citibank', 'dhl', 'fedex', 'ups', 'usps'
        ]

        # Urgency/phishing keywords
        self.urgency_keywords = [
            'urgent', 'verify', 'account', 'suspended', 'locked', 'confirm',
            'update', 'security', 'alert', 'warning', 'expire', 'limited',
            'immediately', 'click here', 'act now', 'suspended', 'unusual',
            'unauthorized', 'restricted'
        ]

    def extract_features(self, html_content, url=None):
        """
        Extract all HTML features from content.

        Args:
            html_content: HTML string content
            url: Optional URL for additional context

        Returns:
            Dictionary of features; on any parse/extraction error the
            all-zeros default dictionary is returned instead.
        """
        features = {}

        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Basic structure features
            features.update(self._extract_structure_features(soup))

            # Form features
            features.update(self._extract_form_features(soup))

            # Link features
            features.update(self._extract_link_features(soup, url))

            # Script features
            features.update(self._extract_script_features(soup))

            # Text content features
            features.update(self._extract_text_features(soup))

            # Meta tag features
            features.update(self._extract_meta_features(soup))

            # External resource features
            features.update(self._extract_resource_features(soup, url))

            # Advanced phishing indicators
            features.update(self._extract_advanced_features(soup))

        except Exception as e:
            print(f"Error extracting features: {e}")
            # Return default features on error
            features = self._get_default_features()

        return features

    def _extract_structure_features(self, soup):
        """Extract basic HTML structure features (tag counts, title flag)."""
        return {
            'html_length': len(str(soup)),
            'num_tags': len(soup.find_all()),
            'num_divs': len(soup.find_all('div')),
            'num_spans': len(soup.find_all('span')),
            'num_paragraphs': len(soup.find_all('p')),
            'num_headings': len(soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])),
            'num_lists': len(soup.find_all(['ul', 'ol'])),
            'num_images': len(soup.find_all('img')),
            'num_iframes': len(soup.find_all('iframe')),
            'num_tables': len(soup.find_all('table')),
            'has_title': 1 if soup.find('title') else 0,
        }

    def _extract_form_features(self, soup):
        """Extract form-related features, including action destinations."""
        forms = soup.find_all('form')

        features = {
            'num_forms': len(forms),
            'num_input_fields': len(soup.find_all('input')),
            'num_password_fields': len(soup.find_all('input', {'type': 'password'})),
            'num_email_fields': len(soup.find_all('input', {'type': 'email'})),
            'num_text_fields': len(soup.find_all('input', {'type': 'text'})),
            'num_submit_buttons': len(soup.find_all(['input', 'button'], {'type': 'submit'})),
            'num_hidden_fields': len(soup.find_all('input', {'type': 'hidden'})),
            'has_form': 1 if forms else 0,
        }

        # Check form actions: absolute URLs (off-site posts) vs empty/self.
        if forms:
            form_actions = [form.get('action', '') for form in forms]
            features['num_external_form_actions'] = sum(1 for action in form_actions
                                                        if action.startswith('http'))
            features['num_empty_form_actions'] = sum(1 for action in form_actions
                                                     if not action or action == '#')
        else:
            features['num_external_form_actions'] = 0
            features['num_empty_form_actions'] = 0

        return features

    def _extract_link_features(self, soup, url=None):
        """Extract link-related features (counts, external ratio, IP links)."""
        links = soup.find_all('a')
        hrefs = [link.get('href', '') for link in links]

        features = {
            'num_links': len(links),
            'num_external_links': sum(1 for href in hrefs if href.startswith('http')),
            'num_internal_links': sum(1 for href in hrefs if href.startswith('/') or href.startswith('#')),
            'num_empty_links': sum(1 for href in hrefs if not href or href == '#'),
            'num_mailto_links': sum(1 for href in hrefs if href.startswith('mailto:')),
            'num_javascript_links': sum(1 for href in hrefs if 'javascript:' in href.lower()),
        }

        # Calculate ratio of external links
        if features['num_links'] > 0:
            features['ratio_external_links'] = features['num_external_links'] / features['num_links']  # type: ignore
        else:
            features['ratio_external_links'] = 0

        # Links addressed by raw IPv4 address — a strong phishing signal.
        features['num_ip_based_links'] = sum(1 for href in hrefs
                                             if re.search(r'http://\d+\.\d+\.\d+\.\d+', href))

        return features

    def _extract_script_features(self, soup):
        """Extract JavaScript/script features, including obfuscation calls."""
        scripts = soup.find_all('script')

        features = {
            'num_scripts': len(scripts),
            'num_inline_scripts': sum(1 for script in scripts if script.string),
            'num_external_scripts': sum(1 for script in scripts if script.get('src')),
        }

        # Substring scan over all inline script bodies.
        script_content = ' '.join([script.string for script in scripts if script.string])
        features['has_eval'] = 1 if 'eval(' in script_content else 0
        features['has_unescape'] = 1 if 'unescape(' in script_content else 0
        features['has_escape'] = 1 if 'escape(' in script_content else 0
        features['has_document_write'] = 1 if 'document.write' in script_content else 0

        return features

    def _extract_text_features(self, soup):
        """Extract visible-text features (brand/urgency keywords, contacts)."""
        # Get all visible text
        text = soup.get_text(separator=' ', strip=True).lower()

        features = {
            'text_length': len(text),
            'num_words': len(text.split()),
        }

        # Check for brand mentions
        features['num_brand_mentions'] = sum(1 for brand in self.brand_keywords
                                             if brand in text)

        # Check for urgency keywords
        features['num_urgency_keywords'] = sum(1 for keyword in self.urgency_keywords
                                               if keyword in text)

        # Check for specific patterns
        features['has_copyright'] = 1 if '©' in text or 'copyright' in text else 0
        features['has_phone_number'] = 1 if re.search(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', text) else 0
        features['has_email'] = 1 if re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text) else 0

        return features

    def _extract_meta_features(self, soup):
        """Extract meta tag features."""
        meta_tags = soup.find_all('meta')

        features = {
            'num_meta_tags': len(meta_tags),
            'has_description': 1 if soup.find('meta', {'name': 'description'}) else 0,
            'has_keywords': 1 if soup.find('meta', {'name': 'keywords'}) else 0,
            'has_author': 1 if soup.find('meta', {'name': 'author'}) else 0,
            'has_viewport': 1 if soup.find('meta', {'name': 'viewport'}) else 0,
        }

        # Check for refresh meta tag (often used in phishing)
        refresh_meta = soup.find('meta', {'http-equiv': 'refresh'})
        features['has_meta_refresh'] = 1 if refresh_meta else 0

        return features

    def _extract_resource_features(self, soup, url=None):
        """Extract external resource features (CSS, images, favicon)."""
        # CSS links
        css_links = soup.find_all('link', {'rel': 'stylesheet'})

        # Images
        images = soup.find_all('img')
        img_srcs = [img.get('src', '') for img in images]

        # Inline styles
        inline_style_tags = soup.find_all('style')
        inline_style_content = ''.join([tag.string or '' for tag in inline_style_tags])

        features = {
            'num_css_files': len(css_links),
            'num_external_css': sum(1 for link in css_links
                                    if link.get('href', '').startswith('http')),
            'num_external_images': sum(1 for src in img_srcs if src.startswith('http')),
            'num_data_uri_images': sum(1 for src in img_srcs if src.startswith('data:')),
            'num_inline_styles': len(inline_style_tags),
            'inline_css_length': len(inline_style_content),
            'has_favicon': 1 if soup.find('link', {'rel': 'icon'}) or soup.find('link', {'rel': 'shortcut icon'}) else 0,
        }

        return features

    def _extract_advanced_features(self, soup):
        """Extract advanced phishing indicators (obfuscation, hiding, redirects)."""
        features = {}

        # Suspicious element combinations: password field + off-site action.
        has_password = len(soup.find_all('input', {'type': 'password'})) > 0
        has_external_action = any(
            form.get('action', '').startswith('http')
            for form in soup.find_all('form')
        )
        features['password_with_external_action'] = 1 if (has_password and has_external_action) else 0

        # Obfuscation indicators — scanned over the raw markup.
        all_text = str(soup).lower()
        features['has_base64'] = 1 if 'base64' in all_text else 0
        features['has_atob'] = 1 if 'atob(' in all_text else 0
        features['has_fromcharcode'] = 1 if 'fromcharcode' in all_text else 0

        # Suspicious inline event handlers
        features['num_onload_events'] = len(soup.find_all(attrs={'onload': True}))
        features['num_onerror_events'] = len(soup.find_all(attrs={'onerror': True}))
        features['num_onclick_events'] = len(soup.find_all(attrs={'onclick': True}))

        # Domain analysis from links
        external_domains = set()
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href.startswith('http'):
                try:
                    domain = urlparse(href).netloc
                    if domain:
                        external_domains.add(domain)
                except Exception:
                    # FIX: was a bare `except:` which also swallowed
                    # SystemExit/KeyboardInterrupt.
                    pass
        features['num_unique_external_domains'] = len(external_domains)

        # Suspicious patterns in forms
        forms = soup.find_all('form')
        features['num_forms_without_labels'] = sum(
            1 for form in forms
            if len(form.find_all('label')) == 0 and len(form.find_all('input')) > 0
        )

        # CSS visibility hiding (phishing technique)
        features['has_display_none'] = 1 if 'display:none' in all_text or 'display: none' in all_text else 0
        features['has_visibility_hidden'] = 1 if 'visibility:hidden' in all_text or 'visibility: hidden' in all_text else 0

        # Popup/redirect indicators
        features['has_window_open'] = 1 if 'window.open' in all_text else 0
        features['has_location_replace'] = 1 if 'location.replace' in all_text or 'location.href' in all_text else 0

        return features

    def _get_default_features(self):
        """Return dictionary with all features set to 0.

        FIX: the key set now mirrors exactly what extract_features()
        produces — the previous version omitted the five has_* meta flags
        (has_description, has_keywords, has_author, has_viewport,
        has_meta_refresh) and listed four resource keys twice (duplicate
        dict keys collapse silently), so error rows had fewer columns than
        successfully parsed rows.
        """
        feature_names = [
            # Structure features (11)
            'html_length', 'num_tags', 'num_divs', 'num_spans',
            'num_paragraphs', 'num_headings', 'num_lists', 'num_images',
            'num_iframes', 'num_tables', 'has_title',

            # Form features (10)
            'num_forms', 'num_input_fields', 'num_password_fields',
            'num_email_fields', 'num_text_fields', 'num_submit_buttons',
            'num_hidden_fields', 'has_form', 'num_external_form_actions',
            'num_empty_form_actions',

            # Link features (8)
            'num_links', 'num_external_links', 'num_internal_links',
            'num_empty_links', 'num_mailto_links', 'num_javascript_links',
            'ratio_external_links', 'num_ip_based_links',

            # Script features (7)
            'num_scripts', 'num_inline_scripts', 'num_external_scripts',
            'has_eval', 'has_unescape', 'has_escape', 'has_document_write',

            # Text features (7)
            'text_length', 'num_words', 'num_brand_mentions',
            'num_urgency_keywords', 'has_copyright', 'has_phone_number',
            'has_email',

            # Meta features (6)
            'num_meta_tags', 'has_description', 'has_keywords',
            'has_author', 'has_viewport', 'has_meta_refresh',

            # Resource features (7)
            'num_css_files', 'num_external_css', 'num_external_images',
            'num_data_uri_images', 'num_inline_styles', 'inline_css_length',
            'has_favicon',

            # Advanced phishing indicators (13)
            'password_with_external_action', 'has_base64', 'has_atob',
            'has_fromcharcode', 'num_onload_events', 'num_onerror_events',
            'num_onclick_events', 'num_unique_external_domains',
            'num_forms_without_labels', 'has_display_none',
            'has_visibility_hidden', 'has_window_open', 'has_location_replace',
        ]
        return {name: 0 for name in feature_names}

    def get_feature_names(self):
        """Return list of all feature names."""
        return list(self._get_default_features().keys())
+
340
+
341
def extract_features_from_file(html_file_path, url=None):
    """
    Extract features from a single HTML file.

    Args:
        html_file_path: Path to HTML file
        url: Optional URL for context

    Returns:
        Dictionary of features; the all-zeros defaults if the file
        cannot be read or processed.
    """
    extractor = HTMLFeatureExtractor()

    try:
        with open(html_file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            markup = handle.read()
        return extractor.extract_features(markup, url)
    except Exception as e:
        print(f"Error reading file {html_file_path}: {e}")
        return extractor._get_default_features()
362
+
363
+
364
if __name__ == '__main__':
    # Manual smoke test: pass an HTML file path to dump its features,
    # or run without arguments to list the available feature names.
    import sys

    if len(sys.argv) <= 1:
        print("Usage: python html_features.py <html_file_path>")
        print("\nAvailable features:")
        extractor = HTMLFeatureExtractor()
        for i, feature in enumerate(extractor.get_feature_names(), 1):
            print(f"{i:2d}. {feature}")
        print(f"\nTotal: {len(extractor.get_feature_names())} features")
    else:
        html_file = sys.argv[1]
        features = extract_features_from_file(html_file)

        print(f"\nExtracted {len(features)} features from {html_file}:")
        print("-" * 80)
        for feature, value in features.items():
            print(f"{feature:30s}: {value}")
scripts/feature_extraction/url/__pycache__/url_features_v3.cpython-313.pyc ADDED
Binary file (50.9 kB). View file
 
scripts/feature_extraction/url/url_features_diagnostic.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
from collections import Counter
from urllib.parse import urlparse

# Load the extracted feature table and split it by class label.
df = pd.read_csv('data/features/url_features.csv')
phish_df = df.loc[df['label'] == 1].copy()  # Assuming 1 = phishing
legit_df = df.loc[df['label'] == 0].copy()  # Assuming 0 = legitimate

print("=== FREE PLATFORM DETECTION ANALYSIS ===\n")

# 1. Check how often the is_free_platform feature fires in each class.
print(f"Total phishing: {len(phish_df)}")
print(f"Phishing on free platforms: {phish_df['is_free_platform'].sum()} ({phish_df['is_free_platform'].mean()*100:.1f}%)")
print(f"\nTotal legitimate: {len(legit_df)}")
print(f"Legitimate on free platforms: {legit_df['is_free_platform'].sum()} ({legit_df['is_free_platform'].mean()*100:.1f}%)")

# 2. Load original URLs
urls_df = pd.read_csv('data/processed/clean_dataset.csv')
phish_urls = urls_df.loc[urls_df['label'] == 1, 'url'].tolist()  # Adjust column names
legit_urls = urls_df.loc[urls_df['label'] == 0, 'url'].tolist()
23
# 3. Extract domains from phishing URLs
def extract_domain(url):
    """Return the lowercased network location of *url*, or '' on failure.

    URLs without a scheme get an 'http://' prefix so urlparse places the
    host in netloc instead of path.
    """
    try:
        parsed = urlparse(url if url.startswith('http') else 'http://' + url)
        return parsed.netloc.lower()
    # Was a bare `except:`: that also swallowed KeyboardInterrupt/SystemExit.
    except Exception:
        return ''
30
+
31
phish_domains = list(map(extract_domain, phish_urls))

# 4. Find common domain patterns
print("\n=== TOP 50 PHISHING DOMAINS (by frequency) ===")
domain_counts = Counter(phish_domains)
for domain, count in domain_counts.most_common(50):
    print(f"{domain:50s}: {count:5d}")

# 5. Find common suffixes (platforms)
print("\n=== COMMON DOMAIN SUFFIXES (platforms) ===")
suffixes = [
    '.'.join(domain.split('.')[-2:])  # Last 2 parts (e.g. weebly.com)
    for domain in phish_domains
    if len(domain.split('.')) >= 2
]

suffix_counts = Counter(suffixes)
print("\nTop 30 suffixes:")
for suffix, count in suffix_counts.most_common(30):
    print(f"{suffix:30s}: {count:5d} ({count/len(phish_domains)*100:.1f}%)")
scripts/feature_extraction/url/url_features_v1.py ADDED
@@ -0,0 +1,626 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ URL Feature Extraction v1 - URL-Only Features for Stage 1 Model
3
+
4
+ This extractor focuses ONLY on URL structure and lexical features.
5
+ NO HTTP requests, NO external services, NO HTML parsing.
6
+
7
+ Features:
8
+ - Lexical (length, characters, entropy)
9
+ - Structural (domain parts, path segments, TLD)
10
+ - Statistical (entropy, n-grams, patterns)
11
+ - Security indicators (from URL only)
12
+ - Brand/phishing patterns
13
+
14
+ Designed for:
15
+ - Fast inference (< 1ms per URL)
16
+ - No network dependencies
17
+ - Production deployment
18
+ """
19
+
20
+ import pandas as pd
21
+ import numpy as np
22
+ from urllib.parse import urlparse, parse_qs, unquote
23
+ import re
24
+ import math
25
+ import socket
26
+ from pathlib import Path
27
+ from collections import Counter
28
+ import sys
29
+ import logging
30
+
31
+ # Setup logging
32
+ logging.basicConfig(
33
+ level=logging.INFO,
34
+ format='%(asctime)s - %(levelname)s - %(message)s',
35
+ datefmt='%H:%M:%S'
36
+ )
37
+ logger = logging.getLogger("url_features_v2")
38
+
39
+
40
class URLFeatureExtractorV2:
    """
    Fast URL-only feature extractor for Stage 1 phishing detection.

    No HTTP requests, no external API calls - pure URL analysis.
    """

    def __init__(self):
        """Initialize feature extractor with keyword lists."""

        # Phishing-related keywords
        self.phishing_keywords = [
            'login', 'signin', 'sign-in', 'log-in', 'logon', 'signon',
            'account', 'accounts', 'update', 'verify', 'verification',
            'secure', 'security', 'banking', 'bank', 'confirm', 'password',
            'passwd', 'credential', 'suspended', 'locked', 'unusual',
            'authenticate', 'auth', 'wallet', 'invoice', 'payment',
            'billing', 'expire', 'expired', 'limited', 'restrict',
            'urgent', 'immediately', 'alert', 'warning', 'resolve',
            'recover', 'restore', 'reactivate', 'unlock', 'validate'
        ]

        # Brand names commonly targeted
        self.brand_names = [
            'paypal', 'ebay', 'amazon', 'apple', 'microsoft', 'google',
            'facebook', 'instagram', 'twitter', 'netflix', 'linkedin',
            'dropbox', 'chase', 'wellsfargo', 'bankofamerica', 'citibank',
            'americanexpress', 'amex', 'visa', 'mastercard', 'outlook',
            'office365', 'office', 'yahoo', 'aol', 'icloud', 'adobe',
            'spotify', 'steam', 'dhl', 'fedex', 'ups', 'usps',
            'coinbase', 'binance', 'blockchain', 'metamask', 'whatsapp',
            'telegram', 'discord', 'zoom', 'docusign', 'wetransfer',
            'hsbc', 'barclays', 'santander', 'ing', 'revolut'
        ]

        # URL shorteners (matched as substrings of the host)
        self.shorteners = [
            'bit.ly', 'bitly.com', 'goo.gl', 'tinyurl.com', 't.co', 'ow.ly',
            'is.gd', 'buff.ly', 'adf.ly', 'bit.do', 'short.to', 'tiny.cc',
            'j.mp', 'surl.li', 'rb.gy', 'cutt.ly', 'qrco.de', 'v.gd',
            'shorturl.at', 'rebrand.ly', 'clck.ru', 's.id', 'shrtco.de'
        ]

        # Suspicious TLDs
        self.suspicious_tlds = {
            'tk', 'ml', 'ga', 'cf', 'gq',  # Free domains
            'xyz', 'top', 'club', 'work', 'date', 'racing', 'win',
            'loan', 'download', 'stream', 'click', 'link', 'bid',
            'review', 'party', 'trade', 'webcam', 'science',
            'accountant', 'faith', 'cricket', 'zip', 'mov'
        }

        # Trusted TLDs
        self.trusted_tlds = {
            'com', 'org', 'net', 'edu', 'gov', 'mil',
            'uk', 'us', 'ca', 'de', 'fr', 'jp', 'au',
            'nl', 'be', 'ch', 'it', 'es', 'se', 'no'
        }

        # Free hosting services
        self.free_hosting = [
            'weebly.com', 'wix.com', 'wordpress.com', 'blogspot.com',
            'tumblr.com', 'jimdo.com', 'github.io', 'gitlab.io',
            'netlify.app', 'vercel.app', 'herokuapp.com', 'firebaseapp.com',
            'web.app', 'pages.dev', 'godaddysites.com', 'square.site',
            '000webhostapp.com', 'sites.google.com', 'carrd.co'
        ]

        # Lazily-populated cache for get_feature_names().
        self._feature_names = None

    def extract_features(self, url: str) -> dict:
        """
        Extract all URL-only features from a single URL.

        Args:
            url: URL string

        Returns:
            Dictionary of features (all-zero defaults if extraction fails)
        """
        try:
            # Ensure URL has scheme
            if not url.startswith(('http://', 'https://')):
                url = 'http://' + url

            parsed = urlparse(url)
            domain = parsed.netloc.lower()
            domain_no_port = domain.split(':')[0]
            path = parsed.path
            query = parsed.query

            features = {}

            # 1. Length features
            features.update(self._length_features(url, domain_no_port, path, query))

            # 2. Character count features
            features.update(self._char_count_features(url, domain_no_port, path))

            # 3. Ratio features
            features.update(self._ratio_features(url, domain_no_port))

            # 4. Domain structure features
            features.update(self._domain_features(domain_no_port, parsed))

            # 5. Path features
            features.update(self._path_features(path))

            # 6. Query features
            features.update(self._query_features(query))

            # 7. Statistical features (entropy, patterns)
            features.update(self._statistical_features(url, domain_no_port, path))

            # 8. Security indicator features
            features.update(self._security_features(url, parsed, domain_no_port))

            # 9. Keyword/brand features
            features.update(self._keyword_features(url, domain_no_port, path))

            # 10. Encoding features
            features.update(self._encoding_features(url, domain_no_port))

            return features

        except Exception as e:
            logger.error(f"Error extracting features from URL: {url[:50]}... Error: {e}")
            return self._get_default_features()

    def _length_features(self, url: str, domain: str, path: str, query: str) -> dict:
        """Length-based features."""
        return {
            'url_length': len(url),
            'domain_length': len(domain),
            'path_length': len(path),
            'query_length': len(query),

            # Binary indicators
            'url_length_gt_75': 1 if len(url) > 75 else 0,
            'url_length_gt_100': 1 if len(url) > 100 else 0,
            'url_length_gt_150': 1 if len(url) > 150 else 0,
            'domain_length_gt_25': 1 if len(domain) > 25 else 0,
        }

    def _char_count_features(self, url: str, domain: str, path: str) -> dict:
        """Character count features."""
        return {
            # URL character counts
            'num_dots': url.count('.'),
            'num_hyphens': url.count('-'),
            'num_underscores': url.count('_'),
            'num_slashes': url.count('/'),
            'num_question_marks': url.count('?'),
            'num_ampersands': url.count('&'),
            'num_equals': url.count('='),
            'num_at': url.count('@'),
            'num_percent': url.count('%'),
            'num_digits_url': sum(c.isdigit() for c in url),
            'num_letters_url': sum(c.isalpha() for c in url),

            # Domain character counts
            'domain_dots': domain.count('.'),
            'domain_hyphens': domain.count('-'),
            'domain_digits': sum(c.isdigit() for c in domain),

            # Path character counts
            'path_slashes': path.count('/'),
            'path_dots': path.count('.'),
            'path_digits': sum(c.isdigit() for c in path),
        }

    def _ratio_features(self, url: str, domain: str) -> dict:
        """Ratio-based features."""
        url_len = max(len(url), 1)
        domain_len = max(len(domain), 1)

        return {
            'digit_ratio_url': sum(c.isdigit() for c in url) / url_len,
            'letter_ratio_url': sum(c.isalpha() for c in url) / url_len,
            'special_char_ratio': sum(not c.isalnum() for c in url) / url_len,
            'digit_ratio_domain': sum(c.isdigit() for c in domain) / domain_len,
            'symbol_ratio_domain': sum(c in '-_.' for c in domain) / domain_len,
        }

    def _domain_features(self, domain: str, parsed) -> dict:
        """Domain structure features."""
        parts = domain.split('.')
        tld = parts[-1] if parts else ''

        # Get SLD (second level domain)
        sld = parts[-2] if len(parts) > 1 else ''

        # Count subdomains (parts minus domain and TLD)
        num_subdomains = max(0, len(parts) - 2)

        # FIX: parsed.port raises ValueError for malformed ports
        # (e.g. 'http://x:abc/'); previously that exception propagated to
        # extract_features() and wiped EVERY feature of the URL to defaults.
        # Treat a malformed port as "no port" instead.
        try:
            port = parsed.port
        except ValueError:
            port = None

        return {
            'num_subdomains': num_subdomains,
            'num_domain_parts': len(parts),
            'tld_length': len(tld),
            'sld_length': len(sld),
            'longest_domain_part': max((len(p) for p in parts), default=0),
            'avg_domain_part_len': sum(len(p) for p in parts) / max(len(parts), 1),

            # TLD indicators
            'has_suspicious_tld': 1 if tld in self.suspicious_tlds else 0,
            'has_trusted_tld': 1 if tld in self.trusted_tlds else 0,

            # Port
            'has_port': 1 if port else 0,
            'has_non_std_port': 1 if port and port not in [80, 443] else 0,
        }

    def _path_features(self, path: str) -> dict:
        """Path structure features."""
        segments = [s for s in path.split('/') if s]

        # Get file extension if present
        extension = ''
        if '.' in path:
            potential_ext = path.rsplit('.', 1)[-1].split('?')[0].lower()
            if len(potential_ext) <= 10:
                extension = potential_ext

        return {
            'path_depth': len(segments),
            'max_path_segment_len': max((len(s) for s in segments), default=0),
            'avg_path_segment_len': sum(len(s) for s in segments) / max(len(segments), 1),

            # Extension features
            'has_extension': 1 if extension else 0,
            'has_php': 1 if extension == 'php' else 0,
            'has_html': 1 if extension in ['html', 'htm'] else 0,
            'has_exe': 1 if extension in ['exe', 'bat', 'cmd', 'msi'] else 0,

            # Suspicious path patterns
            'has_double_slash': 1 if '//' in path else 0,
        }

    def _query_features(self, query: str) -> dict:
        """Query string features."""
        params = parse_qs(query)

        return {
            'num_params': len(params),
            'has_query': 1 if query else 0,
            'query_value_length': sum(len(''.join(v)) for v in params.values()),
            'max_param_len': max((len(k) + len(''.join(v)) for k, v in params.items()), default=0),
        }

    def _statistical_features(self, url: str, domain: str, path: str) -> dict:
        """Statistical and entropy features."""
        return {
            # Entropy
            'url_entropy': self._entropy(url),
            'domain_entropy': self._entropy(domain),
            'path_entropy': self._entropy(path) if path else 0,

            # Consecutive character patterns
            'max_consecutive_digits': self._max_consecutive(url, str.isdigit),
            'max_consecutive_chars': self._max_consecutive(url, str.isalpha),
            'max_consecutive_consonants': self._max_consecutive_consonants(domain),

            # Character variance
            'char_repeat_rate': self._repeat_rate(url),

            # N-gram uniqueness
            'unique_bigram_ratio': self._unique_ngram_ratio(url, 2),
            'unique_trigram_ratio': self._unique_ngram_ratio(url, 3),

            # Vowel/consonant ratio in domain
            'vowel_ratio_domain': self._vowel_ratio(domain),
        }

    def _security_features(self, url: str, parsed, domain: str) -> dict:
        """Security indicator features (URL-based only)."""
        return {
            # Protocol
            'is_https': 1 if parsed.scheme == 'https' else 0,
            'is_http': 1 if parsed.scheme == 'http' else 0,

            # IP address
            'has_ip_address': 1 if self._is_ip(domain) else 0,

            # Suspicious patterns
            'has_at_symbol': 1 if '@' in url else 0,
            'has_redirect': 1 if 'redirect' in url.lower() or 'url=' in url.lower() else 0,

            # URL shortener
            'is_shortened': 1 if any(s in domain for s in self.shorteners) else 0,

            # Free hosting
            'is_free_hosting': 1 if any(h in domain for h in self.free_hosting) else 0,

            # www presence
            'has_www': 1 if domain.startswith('www.') else 0,
            'www_in_middle': 1 if 'www' in domain and not domain.startswith('www') else 0,
        }

    def _keyword_features(self, url: str, domain: str, path: str) -> dict:
        """Keyword and brand detection features."""
        url_lower = url.lower()
        domain_lower = domain.lower()
        path_lower = path.lower()

        # Count phishing keywords
        phishing_in_url = sum(1 for k in self.phishing_keywords if k in url_lower)
        phishing_in_domain = sum(1 for k in self.phishing_keywords if k in domain_lower)
        phishing_in_path = sum(1 for k in self.phishing_keywords if k in path_lower)

        # Count brand names
        brands_in_url = sum(1 for b in self.brand_names if b in url_lower)
        brands_in_domain = sum(1 for b in self.brand_names if b in domain_lower)
        brands_in_path = sum(1 for b in self.brand_names if b in path_lower)

        # Brand impersonation: brand in path but not in domain
        brand_impersonation = 1 if brands_in_path > 0 and brands_in_domain == 0 else 0

        return {
            'num_phishing_keywords': phishing_in_url,
            'phishing_in_domain': phishing_in_domain,
            'phishing_in_path': phishing_in_path,

            'num_brands': brands_in_url,
            'brand_in_domain': 1 if brands_in_domain > 0 else 0,
            'brand_in_path': 1 if brands_in_path > 0 else 0,
            'brand_impersonation': brand_impersonation,

            # Specific high-value keywords
            'has_login': 1 if 'login' in url_lower or 'signin' in url_lower else 0,
            'has_account': 1 if 'account' in url_lower else 0,
            'has_verify': 1 if 'verify' in url_lower or 'confirm' in url_lower else 0,
            'has_secure': 1 if 'secure' in url_lower or 'security' in url_lower else 0,
            'has_update': 1 if 'update' in url_lower else 0,
            'has_bank': 1 if 'bank' in url_lower else 0,
            'has_password': 1 if 'password' in url_lower or 'passwd' in url_lower else 0,
            'has_suspend': 1 if 'suspend' in url_lower or 'locked' in url_lower else 0,

            # Suspicious patterns
            'has_webscr': 1 if 'webscr' in url_lower else 0,
            'has_cmd': 1 if 'cmd=' in url_lower else 0,
            'has_cgi': 1 if 'cgi-bin' in url_lower or 'cgi_bin' in url_lower else 0,
        }

    def _encoding_features(self, url: str, domain: str) -> dict:
        """Encoding-related features."""
        # Check for punycode
        has_punycode = 'xn--' in domain

        # Decode and check difference
        try:
            decoded = unquote(url)
            encoding_diff = len(decoded) - len(url)
        except Exception:  # was a bare except
            encoding_diff = 0

        # Safe regex checks (wrap in try-except for malformed URLs)
        try:
            has_hex = 1 if re.search(r'[0-9a-f]{20,}', url.lower()) else 0
        except Exception:  # was a bare except
            has_hex = 0

        try:
            has_base64 = 1 if re.search(r'[A-Za-z0-9+/]{30,}={0,2}', url) else 0
        except Exception:  # was a bare except
            has_base64 = 0

        try:
            has_unicode = 1 if any(ord(c) > 127 for c in url) else 0
        except Exception:  # was a bare except
            has_unicode = 0

        return {
            'has_url_encoding': 1 if '%' in url else 0,
            'encoding_count': url.count('%'),
            'encoding_diff': abs(encoding_diff),
            'has_punycode': 1 if has_punycode else 0,
            'has_unicode': has_unicode,
            'has_hex_string': has_hex,
            'has_base64': has_base64,
        }

    # Helper methods
    def _entropy(self, text: str) -> float:
        """Calculate Shannon entropy."""
        if not text:
            return 0.0
        freq = Counter(text)
        length = len(text)
        return -sum((c / length) * math.log2(c / length) for c in freq.values())

    def _max_consecutive(self, text: str, condition) -> int:
        """Max consecutive characters matching condition."""
        max_count = count = 0
        for char in text:
            if condition(char):
                count += 1
                max_count = max(max_count, count)
            else:
                count = 0
        return max_count

    def _max_consecutive_consonants(self, text: str) -> int:
        """Max consecutive consonants."""
        consonants = set('bcdfghjklmnpqrstvwxyz')
        max_count = count = 0
        for char in text.lower():
            if char in consonants:
                count += 1
                max_count = max(max_count, count)
            else:
                count = 0
        return max_count

    def _repeat_rate(self, text: str) -> float:
        """Rate of repeated adjacent characters."""
        if len(text) < 2:
            return 0.0
        repeats = sum(1 for i in range(len(text) - 1) if text[i] == text[i + 1])
        return repeats / (len(text) - 1)

    def _unique_ngram_ratio(self, text: str, n: int) -> float:
        """Ratio of unique n-grams to total n-grams."""
        if len(text) < n:
            return 0.0
        ngrams = [text[i:i + n] for i in range(len(text) - n + 1)]
        return len(set(ngrams)) / len(ngrams)

    def _vowel_ratio(self, text: str) -> float:
        """Ratio of vowels in text."""
        if not text:
            return 0.0
        vowels = sum(1 for c in text.lower() if c in 'aeiou')
        letters = sum(1 for c in text if c.isalpha())
        return vowels / max(letters, 1)

    def _is_ip(self, domain: str) -> bool:
        """Check if domain is IP address."""
        # IPv4
        if re.match(r'^(\d{1,3}\.){3}\d{1,3}$', domain):
            return True
        # IPv6
        try:
            socket.inet_pton(socket.AF_INET6, domain.strip('[]'))
            return True
        # FIX: narrowed from a bare except; inet_pton raises OSError for
        # invalid addresses (ValueError kept for safety on odd inputs).
        except (OSError, ValueError):
            return False

    def _get_default_features(self) -> dict:
        """Default feature values for error cases."""
        return {name: 0 for name in self.get_feature_names()}

    def get_feature_names(self) -> list:
        """Get list of all feature names.

        FIX: the names are now derived from a real extraction run (and
        cached), so the list can never drift out of sync with
        extract_features() the way the previous hardcoded copy could.
        Name order matches the extraction order, which matches the order
        of the old hardcoded list.
        """
        if self._feature_names is None:
            # Recursion guard: if the dummy extraction ever failed it would
            # fall back to _get_default_features(), which calls back here.
            self._feature_names = []
            self._feature_names = list(
                self.extract_features('http://example.com/index.html?a=1').keys()
            )
        return list(self._feature_names)

    def extract_batch(self, urls: list, show_progress: bool = True) -> pd.DataFrame:
        """
        Extract features from multiple URLs.

        Args:
            urls: List of URL strings
            show_progress: Show progress messages

        Returns:
            DataFrame with features (one row per URL)
        """
        if show_progress:
            logger.info(f"Extracting URL features from {len(urls):,} URLs...")

        features_list = []
        progress_interval = 50000

        for i, url in enumerate(urls):
            if show_progress and i > 0 and i % progress_interval == 0:
                logger.info(f" Processed {i:,} / {len(urls):,} ({100 * i / len(urls):.1f}%)")

            features = self.extract_features(url)
            features_list.append(features)

        df = pd.DataFrame(features_list)

        if show_progress:
            logger.info(f"✓ Extracted {len(df.columns)} features from {len(df):,} URLs")

        return df
565
+
566
+
567
def main():
    """Extract URL-only features from dataset.

    Reads data/processed/clean_dataset.csv (relative to this script),
    runs the Stage-1 URL-only extractor over every URL, and writes the
    feature matrix plus label column to data/features/.
    """
    import argparse

    cli = argparse.ArgumentParser(description='URL-Only Feature Extraction (Stage 1)')
    cli.add_argument('--sample', type=int, default=None, help='Sample N URLs')
    cli.add_argument('--output', type=str, default=None, help='Output filename')
    args = cli.parse_args()

    banner = "=" * 70
    logger.info(banner)
    logger.info("URL-Only Feature Extraction v1")
    logger.info(banner)
    logger.info("")
    logger.info("Features: URL structure, lexical, statistical")
    logger.info("NO HTTP requests, NO external APIs")
    logger.info("")

    # Load dataset (path resolved relative to this script's location)
    script_dir = Path(__file__).parent
    data_file = (script_dir / '../../data/processed/clean_dataset.csv').resolve()

    logger.info(f"Loading: {data_file.name}")
    df = pd.read_csv(data_file)
    logger.info(f"Loaded: {len(df):,} URLs")

    if args.sample and args.sample < len(df):
        df = df.sample(n=args.sample, random_state=42)
        logger.info(f"Sampled: {len(df):,} URLs")

    # Extract features
    extractor = URLFeatureExtractorV2()
    features_df = extractor.extract_batch(df['url'].tolist())
    features_df['label'] = df['label'].values

    # Save
    output_dir = (script_dir / '../../data/features').resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    suffix = f'_sample{args.sample}' if args.sample else ''
    output_file = output_dir / (args.output or f'url_features{suffix}.csv')

    features_df.to_csv(output_file, index=False)

    logger.info("")
    logger.info(banner)
    logger.info(f"✓ Saved: {output_file}")
    logger.info(f" Shape: {features_df.shape}")
    logger.info(f" Features: {len(features_df.columns) - 1}")
    logger.info(banner)

    # Show stats
    print("\nFeature Statistics (sample):")
    print(features_df.describe().T.head(20))


if __name__ == "__main__":
    main()
scripts/feature_extraction/url/url_features_v2.py ADDED
@@ -0,0 +1,1396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ URL Feature Extraction v2 - IMPROVED VERSION
3
+
4
+ Improvements:
5
+ - Fixed free hosting detection (exact/suffix match instead of substring)
6
+ - Added free platform detection (Google Sites, Weebly, Firebase, etc.)
7
+ - Added UUID subdomain detection (Replit, Firebase patterns)
8
+ - Added platform subdomain length feature
9
+ - Added longest_part thresholds (gt_20, gt_30, gt_40)
10
+ - Expanded brand list with regional brands
11
+ - Improved extension categorization (added archive, image categories)
12
+ - Fixed get_feature_names() to be dynamic
13
+ - Better URL shortener detection
14
+
15
+ Key Features:
16
+ - Lexical (length, characters, entropy)
17
+ - Structural (domain parts, path segments, TLD)
18
+ - Statistical (entropy, n-grams, patterns)
19
+ - Security indicators (from URL only)
20
+ - Brand/phishing patterns
21
+ - FREE PLATFORM ABUSE DETECTION (NEW!)
22
+
23
+ Designed for:
24
+ - Fast inference (< 1ms per URL)
25
+ - No network dependencies
26
+ - Production deployment
27
+ """
28
+
29
+ import pandas as pd
30
+ import numpy as np
31
+ from urllib.parse import urlparse, parse_qs, unquote
32
+ import re
33
+ import math
34
+ import socket
35
+ import unicodedata
36
+ from pathlib import Path
37
+ from collections import Counter
38
+ import sys
39
+ import logging
40
+
41
+ # Setup logging
42
+ logging.basicConfig(
43
+ level=logging.INFO,
44
+ format='%(asctime)s - %(levelname)s - %(message)s',
45
+ datefmt='%H:%M:%S'
46
+ )
47
+ logger = logging.getLogger("url_features_v2")
48
+
49
+
50
+ class URLFeatureExtractorV2:
51
+ """
52
+ Fast URL-only feature extractor for Stage 1 phishing detection.
53
+
54
+ IMPROVED VERSION with better free platform detection.
55
+ """
56
+
57
+ def __init__(self):
58
+ """Initialize feature extractor with keyword lists."""
59
+
60
+ # Phishing-related keywords
61
+ self.phishing_keywords = [
62
+ 'login', 'signin', 'sign-in', 'log-in', 'logon', 'signon',
63
+ 'account', 'accounts', 'update', 'verify', 'verification',
64
+ 'secure', 'security', 'banking', 'bank', 'confirm', 'password',
65
+ 'passwd', 'credential', 'suspended', 'locked', 'unusual',
66
+ 'authenticate', 'auth', 'wallet', 'invoice', 'payment',
67
+ 'billing', 'expire', 'expired', 'limited', 'restrict',
68
+ 'urgent', 'immediately', 'alert', 'warning', 'resolve',
69
+ 'recover', 'restore', 'reactivate', 'unlock', 'validate'
70
+ ]
71
+
72
+ # Brand names - EXPANDED with regional brands
73
+ self.brand_names = [
74
+ # US Tech Giants
75
+ 'paypal', 'ebay', 'amazon', 'apple', 'microsoft', 'google',
76
+ 'facebook', 'instagram', 'twitter', 'netflix', 'linkedin',
77
+ 'dropbox', 'adobe', 'spotify', 'steam', 'zoom', 'docusign',
78
+ 'salesforce', 'shopify', 'square', 'venmo', 'cashapp', 'zelle',
79
+
80
+ # US Banks
81
+ 'chase', 'wellsfargo', 'bankofamerica', 'citibank', 'citi',
82
+ 'americanexpress', 'amex', 'visa', 'mastercard',
83
+ 'capitalone', 'usbank', 'pnc', 'truist',
84
+
85
+ # Email/Communication
86
+ 'outlook', 'office365', 'office', 'yahoo', 'aol', 'icloud',
87
+ 'gmail', 'protonmail', 'whatsapp', 'telegram', 'discord',
88
+ 'signal', 'skype', 'teams',
89
+
90
+ # Shipping/Logistics
91
+ 'dhl', 'fedex', 'ups', 'usps', 'amazon', 'alibaba',
92
+
93
+ # Crypto/Finance
94
+ 'coinbase', 'binance', 'blockchain', 'metamask', 'kraken',
95
+ 'gemini', 'robinhood', 'etrade', 'fidelity', 'schwab',
96
+ 'payoneer', 'stripe', 'wise', 'revolut',
97
+
98
+ # Social/Entertainment
99
+ 'tiktok', 'snapchat', 'twitch', 'roblox', 'epic', 'epicgames',
100
+ 'playstation', 'xbox', 'nintendo', 'blizzard', 'riot',
101
+
102
+ # REGIONAL BRANDS (from analysis)
103
+ # Europe
104
+ 'allegro', 'allegrolokalnie', # Poland
105
+ 'olx', # Europe/LatAm
106
+ 'bol', 'marktplaats', # Netherlands
107
+ 'leboncoin', # France
108
+ 'idealo', 'otto', # Germany
109
+ 'hsbc', 'barclays', 'santander', 'ing', 'revolut', # European banks
110
+
111
+ # Asia
112
+ 'rakuten', # Japan
113
+ 'lazada', 'shopee', # Southeast Asia
114
+ 'baidu', 'taobao', 'alipay', 'wechat', 'weibo', # China
115
+ 'paytm', 'phonepe', # India
116
+
117
+ # Latin America
118
+ 'mercadolibre', 'mercadopago', # LatAm
119
+
120
+ # Russia
121
+ 'yandex', 'vk', 'mailru',
122
+
123
+ # Other
124
+ 'uber', 'lyft', 'airbnb', 'booking', 'expedia',
125
+ 'wetransfer', 'mediafire', 'mega',
126
+ ]
127
+
128
+ # URL shorteners - EXACT MATCH ONLY
129
+ self.shorteners = {
130
+ # Original
131
+ 'bit.ly', 'bitly.com', 'goo.gl', 'tinyurl.com', 't.co', 'ow.ly',
132
+ 'is.gd', 'buff.ly', 'adf.ly', 'bit.do', 'short.to', 'tiny.cc',
133
+ 'j.mp', 'surl.li', 'rb.gy', 'cutt.ly', 'qrco.de', 'v.gd',
134
+ 'shorturl.at', 'rebrand.ly', 'clck.ru', 's.id', 'shrtco.de',
135
+
136
+ # NEW from analysis (CRITICAL!)
137
+ 'qrco.de', # 3,824 occurrences!
138
+ 'q-r.to', # 2,974
139
+ 'l.ead.me', # 2,907
140
+ 'ead.me', # Base domain
141
+ 'urlz.fr',
142
+ 'hotm.art',
143
+ 'reurl.cc',
144
+ 'did.li',
145
+ 'zpr.io',
146
+ 'linkin.bio',
147
+ 'linqapp.com',
148
+ 'linktr.ee',
149
+ 'flow.page',
150
+ 'campsite.bio',
151
+ 'qr-codes.io',
152
+ 'scanned.page',
153
+ 'l.wl.co',
154
+ 'wl.co',
155
+ 'hm.ru',
156
+ 'flowcode.com',
157
+ }
158
+
159
+ # Suspicious TLDs
160
+ self.suspicious_tlds = {
161
+ 'tk', 'ml', 'ga', 'cf', 'gq', # Free domains
162
+ 'xyz', 'top', 'club', 'work', 'date', 'racing', 'win',
163
+ 'loan', 'download', 'stream', 'click', 'link', 'bid',
164
+ 'review', 'party', 'trade', 'webcam', 'science',
165
+ 'accountant', 'faith', 'cricket', 'zip', 'mov',
166
+ 'icu', 'buzz', 'space', 'online', 'site', 'website',
167
+ 'tech', 'store', 'rest', 'cfd', 'monster', 'sbs'
168
+ }
169
+
170
+ # Trusted TLDs
171
+ self.trusted_tlds = {
172
+ 'com', 'org', 'net', 'edu', 'gov', 'mil',
173
+ 'uk', 'us', 'ca', 'de', 'fr', 'jp', 'au',
174
+ 'nl', 'be', 'ch', 'it', 'es', 'se', 'no',
175
+ 'pl', 'br', 'in', 'mx', 'kr', 'ru', 'cn'
176
+ }
177
+
178
+ # FREE PLATFORMS - EXACT/SUFFIX MATCH (from your PhishTank analysis!)
179
+ self.free_platforms = {
180
+ # Website Builders
181
+ 'weebly.com', 'wixsite.com', 'wix.com', 'webflow.io',
182
+ 'framer.website', 'carrd.co', 'notion.site', 'webwave.me',
183
+ 'godaddysites.com', 'square.site', 'sites.google.com',
184
+
185
+ # Google Platforms (HIGH PHISHING RATE from analysis)
186
+ 'firebaseapp.com', 'web.app', 'appspot.com',
187
+ 'firebase.app', 'page.link',
188
+
189
+ # Developer Platforms (from analysis: Replit, Vercel, etc.)
190
+ 'github.io', 'gitlab.io', 'pages.github.com',
191
+ 'vercel.app', 'netlify.app', 'netlify.com',
192
+ 'replit.dev', 'repl.co', 'replit.co',
193
+ 'glitch.me', 'glitch.com',
194
+ 'pages.dev', 'workers.dev', # Cloudflare
195
+ 'herokuapp.com', 'heroku.com',
196
+ 'onrender.com', 'railway.app', 'fly.dev',
197
+ 'amplifyapp.com', # AWS Amplify
198
+ 'surge.sh', 'now.sh',
199
+
200
+ # Blogging/CMS
201
+ 'wordpress.com', 'blogspot.com', 'blogger.com',
202
+ 'tumblr.com', 'medium.com', 'ghost.io',
203
+ 'substack.com', 'beehiiv.com',
204
+
205
+ # Adobe/Creative
206
+ 'adobesites.com', 'myportfolio.com', 'behance.net',
207
+ 'adobe.com', 'framer.app',
208
+
209
+ # Forms/Surveys (from analysis: jotform, hsforms)
210
+ 'jotform.com', 'typeform.com', 'forms.gle',
211
+ 'hsforms.com', 'hubspot.com', 'surveymonkey.com',
212
+ 'formstack.com', 'cognito.com',
213
+
214
+ # File Sharing
215
+ 'dropboxusercontent.com', 'dl.dropboxusercontent.com',
216
+ 'sharepoint.com', '1drv.ms', 'onedrive.live.com',
217
+ 'box.com', 'wetransfer.com', 'we.tl',
218
+
219
+ # Free Hosting
220
+ '000webhostapp.com', 'freehosting.com', 'freehostia.com',
221
+ '5gbfree.com', 'x10hosting.com', 'awardspace.com',
222
+ 'byet.host', 'infinityfree.com',
223
+
224
+ # Education/Sandbox
225
+ 'repl.it', 'codepen.io', 'jsfiddle.net', 'codesandbox.io',
226
+ 'stackblitz.com', 'observablehq.com',
227
+
228
+ # Other (from analysis)
229
+ 'webcindario.com', 'gitbook.io', 'tinyurl.com',
230
+ 'start.page', 'my.site', 'site123.com'
231
+ }
232
+
233
+ # Common English words for dictionary check
234
+ self.common_words = {
235
+ 'about', 'account', 'after', 'again', 'all', 'also', 'america', 'american',
236
+ 'another', 'answer', 'any', 'app', 'apple', 'area', 'back', 'bank', 'best',
237
+ 'between', 'book', 'business', 'call', 'can', 'card', 'care', 'case', 'center',
238
+ 'central', 'change', 'check', 'city', 'class', 'cloud', 'come', 'company',
239
+ 'contact', 'control', 'country', 'course', 'credit', 'data', 'day', 'dept',
240
+ 'department', 'different', 'digital', 'doctor', 'down', 'east', 'easy', 'end',
241
+ 'energy', 'even', 'event', 'every', 'express', 'fact', 'family', 'feel',
242
+ 'field', 'file', 'find', 'first', 'food', 'form', 'free', 'friend', 'from',
243
+ 'game', 'general', 'get', 'give', 'global', 'good', 'government', 'great',
244
+ 'group', 'hand', 'have', 'head', 'health', 'help', 'here', 'high', 'home',
245
+ 'house', 'how', 'image', 'info', 'information', 'insurance', 'international',
246
+ 'into', 'just', 'keep', 'kind', 'know', 'large', 'last', 'late', 'leave',
247
+ 'left', 'legal', 'life', 'like', 'line', 'little', 'local', 'long', 'look',
248
+ 'love', 'mail', 'main', 'make', 'management', 'manager', 'many', 'map', 'market',
249
+ 'marketing', 'media', 'medical', 'member', 'message', 'money', 'month', 'more',
250
+ 'most', 'move', 'music', 'name', 'national', 'need', 'network', 'never', 'new',
251
+ 'news', 'next', 'north', 'not', 'note', 'number', 'office', 'official', 'old',
252
+ 'online', 'only', 'open', 'order', 'other', 'over', 'page', 'part', 'party',
253
+ 'people', 'person', 'personal', 'photo', 'place', 'plan', 'play', 'plus', 'point',
254
+ 'policy', 'portal', 'post', 'power', 'press', 'price', 'private', 'product',
255
+ 'program', 'project', 'property', 'public', 'quality', 'question', 'quick', 'rate',
256
+ 'read', 'real', 'record', 'report', 'research', 'resource', 'result', 'right',
257
+ 'room', 'sale', 'sales', 'save', 'school', 'search', 'second', 'section',
258
+ 'security', 'see', 'senior', 'service', 'services', 'set', 'shop', 'show',
259
+ 'side', 'sign', 'site', 'small', 'social', 'software', 'solution', 'solutions',
260
+ 'some', 'south', 'space', 'special', 'staff', 'start', 'state', 'store', 'story',
261
+ 'student', 'study', 'support', 'sure', 'system', 'systems', 'take', 'team', 'tech',
262
+ 'technology', 'test', 'text', 'than', 'that', 'their', 'them', 'then', 'there',
263
+ 'these', 'they', 'thing', 'think', 'this', 'those', 'through', 'time', 'today',
264
+ 'together', 'total', 'trade', 'training', 'travel', 'trust', 'type', 'under',
265
+ 'university', 'until', 'update', 'upon', 'user', 'value', 'very', 'video',
266
+ 'view', 'want', 'water', 'website', 'week', 'well', 'west', 'what', 'when',
267
+ 'where', 'which', 'while', 'white', 'will', 'with', 'within', 'without', 'woman',
268
+ 'women', 'word', 'work', 'world', 'would', 'write', 'year', 'york', 'young', 'your'
269
+ }
270
+
271
+ # Keyboard patterns
272
+ self.keyboard_patterns = [
273
+ 'qwerty', 'asdfgh', 'zxcvbn', '12345', '123456', '1234567', '12345678',
274
+ 'qwertyuiop', 'asdfghjkl', 'zxcvbnm'
275
+ ]
276
+
277
+ # Lookalike character mappings
278
+ self.lookalike_chars = {
279
+ '0': 'o', 'o': '0',
280
+ '1': 'l', 'l': '1', 'i': '1',
281
+ 'rn': 'm', 'vv': 'w', 'cl': 'd'
282
+ }
283
+
284
+ self.microsoft_services = {
285
+ 'forms.office.com',
286
+ 'sharepoint.com',
287
+ 'onedrive.live.com',
288
+ '1drv.ms',
289
+ }
290
+
291
+ self.zoom_services = {
292
+ 'docs.zoom.us',
293
+ 'zoom.us',
294
+ }
295
+
296
+ self.adobe_services = {
297
+ 'express.adobe.com',
298
+ 'new.express.adobe.com', # Multi-level!
299
+ 'spark.adobe.com',
300
+ 'portfolio.adobe.com',
301
+ }
302
+
303
+ self.google_services = {
304
+ 'docs.google.com',
305
+ 'sites.google.com',
306
+ 'drive.google.com',
307
+ 'script.google.com',
308
+ 'storage.googleapis.com',
309
+ 'storage.cloud.google.com',
310
+ 'forms.google.com',
311
+ 'calendar.google.com',
312
+ 'meet.google.com',
313
+ }
314
+
315
+
316
+ def extract_features(self, url: str) -> dict:
317
+ """
318
+ Extract all URL-only features from a single URL.
319
+
320
+ Args:
321
+ url: URL string
322
+
323
+ Returns:
324
+ Dictionary of features
325
+ """
326
+ try:
327
+ # Ensure URL has scheme
328
+ if not url.startswith(('http://', 'https://')):
329
+ url = 'http://' + url
330
+
331
+ parsed = urlparse(url)
332
+ domain = parsed.netloc.lower()
333
+ domain_no_port = domain.split(':')[0]
334
+ path = parsed.path
335
+ query = parsed.query
336
+
337
+ features = {}
338
+
339
+ # 1. Length features
340
+ features.update(self._length_features(url, domain_no_port, path, query))
341
+
342
+ # 2. Character count features
343
+ features.update(self._char_count_features(url, domain_no_port, path))
344
+
345
+ # 3. Ratio features
346
+ features.update(self._ratio_features(url, domain_no_port))
347
+
348
+ # 4. Domain structure features
349
+ features.update(self._domain_features(domain_no_port, parsed))
350
+
351
+ # 5. Path features
352
+ features.update(self._path_features(path, domain_no_port))
353
+
354
+ # 6. Query features
355
+ features.update(self._query_features(query))
356
+
357
+ # 7. Statistical features (entropy, patterns)
358
+ features.update(self._statistical_features(url, domain_no_port, path))
359
+
360
+ # 8. Security indicator features
361
+ features.update(self._security_features(url, parsed, domain_no_port))
362
+
363
+ # 9. Keyword/brand features
364
+ features.update(self._keyword_features(url, domain_no_port, path, parsed))
365
+
366
+ # 10. Encoding features
367
+ features.update(self._encoding_features(url, domain_no_port))
368
+
369
+ return features
370
+
371
+ except Exception as e:
372
+ logger.error(f"Error extracting features from URL: {url[:50]}... Error: {e}")
373
+ return self._get_default_features()
374
+
375
+ def _length_features(self, url: str, domain: str, path: str, query: str) -> dict:
376
+ """Length-based features."""
377
+ return {
378
+ 'url_length': len(url),
379
+ 'domain_length': len(domain),
380
+ 'path_length': len(path),
381
+ 'query_length': len(query),
382
+
383
+ # Categorical length encoding
384
+ 'url_length_category': self._categorize_length(len(url), [30, 75, 150]),
385
+ 'domain_length_category': self._categorize_length(len(domain), [10, 20, 30]),
386
+ }
387
+
388
+ def _char_count_features(self, url: str, domain: str, path: str) -> dict:
389
+ """Character count features."""
390
+ return {
391
+ # URL character counts
392
+ 'num_dots': url.count('.'),
393
+ 'num_hyphens': url.count('-'),
394
+ 'num_underscores': url.count('_'),
395
+ 'num_slashes': url.count('/'),
396
+ 'num_question_marks': url.count('?'),
397
+ 'num_ampersands': url.count('&'),
398
+ 'num_equals': url.count('='),
399
+ 'num_at': url.count('@'),
400
+ 'num_percent': url.count('%'),
401
+ 'num_digits_url': sum(c.isdigit() for c in url),
402
+ 'num_letters_url': sum(c.isalpha() for c in url),
403
+
404
+ # Domain character counts
405
+ 'domain_dots': domain.count('.'),
406
+ 'domain_hyphens': domain.count('-'),
407
+ 'domain_digits': sum(c.isdigit() for c in domain),
408
+
409
+ # Path character counts
410
+ 'path_slashes': path.count('/'),
411
+ 'path_dots': path.count('.'),
412
+ 'path_digits': sum(c.isdigit() for c in path),
413
+ }
414
+
415
+ def _ratio_features(self, url: str, domain: str) -> dict:
416
+ """Ratio-based features."""
417
+ url_len = max(len(url), 1)
418
+ domain_len = max(len(domain), 1)
419
+
420
+ return {
421
+ 'digit_ratio_url': sum(c.isdigit() for c in url) / url_len,
422
+ 'letter_ratio_url': sum(c.isalpha() for c in url) / url_len,
423
+ 'special_char_ratio': sum(not c.isalnum() for c in url) / url_len,
424
+ 'digit_ratio_domain': sum(c.isdigit() for c in domain) / domain_len,
425
+ 'symbol_ratio_domain': sum(c in '-_.' for c in domain) / domain_len,
426
+ }
427
+
428
+ def _domain_features(self, domain: str, parsed) -> dict:
429
+ """Domain structure features."""
430
+ parts = domain.split('.')
431
+ tld = parts[-1] if parts else ''
432
+ sld = parts[-2] if len(parts) > 1 else ''
433
+ num_subdomains = max(0, len(parts) - 2)
434
+ longest_part = max((len(p) for p in parts), default=0)
435
+
436
+ return {
437
+ 'num_subdomains': num_subdomains,
438
+ 'num_domain_parts': len(parts),
439
+ 'tld_length': len(tld),
440
+ 'sld_length': len(sld),
441
+ 'longest_domain_part': longest_part,
442
+ 'avg_domain_part_len': sum(len(p) for p in parts) / max(len(parts), 1),
443
+
444
+ # NEW: Longest part thresholds (from analysis!)
445
+ 'longest_part_gt_20': 1 if longest_part > 20 else 0,
446
+ 'longest_part_gt_30': 1 if longest_part > 30 else 0,
447
+ 'longest_part_gt_40': 1 if longest_part > 40 else 0,
448
+
449
+ # TLD indicators
450
+ 'has_suspicious_tld': 1 if tld in self.suspicious_tlds else 0,
451
+ 'has_trusted_tld': 1 if tld in self.trusted_tlds else 0,
452
+
453
+ # Port
454
+ 'has_port': 1 if parsed.port else 0,
455
+ 'has_non_std_port': 1 if parsed.port and parsed.port not in [80, 443] else 0,
456
+
457
+ # Domain randomness features
458
+ 'domain_randomness_score': self._calculate_domain_randomness(sld),
459
+ 'sld_consonant_cluster_score': self._consonant_clustering_score(sld),
460
+ 'sld_keyboard_pattern': self._keyboard_pattern_score(sld),
461
+ 'sld_has_dictionary_word': self._contains_dictionary_word(sld),
462
+ 'sld_pronounceability_score': self._pronounceability_score(sld),
463
+ 'domain_digit_position_suspicious': self._suspicious_digit_position(sld),
464
+ }
465
+
466
+ def _path_features(self, path: str, domain: str) -> dict:
467
+ """Path structure features."""
468
+ segments = [s for s in path.split('/') if s]
469
+
470
+ # Get file extension if present
471
+ extension = ''
472
+ if '.' in path:
473
+ potential_ext = path.rsplit('.', 1)[-1].split('?')[0].lower()
474
+ if len(potential_ext) <= 10:
475
+ extension = potential_ext
476
+
477
+ return {
478
+ 'path_depth': len(segments),
479
+ 'max_path_segment_len': max((len(s) for s in segments), default=0),
480
+ 'avg_path_segment_len': sum(len(s) for s in segments) / max(len(segments), 1),
481
+
482
+ # Extension features
483
+ 'has_extension': 1 if extension else 0,
484
+ 'extension_category': self._categorize_extension(extension),
485
+ 'has_suspicious_extension': 1 if extension in ['zip', 'exe', 'apk', 'scr', 'bat', 'cmd'] else 0,
486
+ 'has_exe': 1 if extension in ['exe', 'bat', 'cmd', 'msi'] else 0,
487
+
488
+ # Suspicious path patterns
489
+ 'has_double_slash': 1 if '//' in path else 0,
490
+ 'path_has_brand_not_domain': self._brand_in_path_only(path, domain),
491
+ 'path_has_ip_pattern': 1 if re.search(r'\d{1,3}[._-]\d{1,3}[._-]\d{1,3}', path) else 0,
492
+ 'suspicious_path_extension_combo': self._suspicious_extension_pattern(path),
493
+ }
494
+
495
+ def _query_features(self, query: str) -> dict:
496
+ """Query string features."""
497
+ params = parse_qs(query)
498
+
499
+ return {
500
+ 'num_params': len(params),
501
+ 'has_query': 1 if query else 0,
502
+ 'query_value_length': sum(len(''.join(v)) for v in params.values()),
503
+ 'max_param_len': max((len(k) + len(''.join(v)) for k, v in params.items()), default=0),
504
+ 'query_has_url': 1 if re.search(r'https?%3A%2F%2F|http%3A//', query.lower()) else 0,
505
+ }
506
+
507
+ def _statistical_features(self, url: str, domain: str, path: str) -> dict:
508
+ """Statistical and entropy features."""
509
+ parts = domain.split('.')
510
+ sld = parts[-2] if len(parts) > 1 else domain
511
+
512
+ return {
513
+ # Entropy
514
+ 'url_entropy': self._entropy(url),
515
+ 'domain_entropy': self._entropy(domain),
516
+ 'path_entropy': self._entropy(path) if path else 0,
517
+
518
+ # Consecutive character patterns
519
+ 'max_consecutive_digits': self._max_consecutive(url, str.isdigit),
520
+ 'max_consecutive_chars': self._max_consecutive(url, str.isalpha),
521
+ 'max_consecutive_consonants': self._max_consecutive_consonants(domain),
522
+
523
+ # Character variance
524
+ 'char_repeat_rate': self._repeat_rate(url),
525
+
526
+ # N-gram uniqueness
527
+ 'unique_bigram_ratio': self._unique_ngram_ratio(url, 2),
528
+ 'unique_trigram_ratio': self._unique_ngram_ratio(url, 3),
529
+
530
+ # Improved statistical features
531
+ 'sld_letter_diversity': self._character_diversity(sld),
532
+ 'domain_has_numbers_letters': 1 if any(c.isdigit() for c in domain) and any(c.isalpha() for c in domain) else 0,
533
+ 'url_complexity_score': self._calculate_url_complexity(url),
534
+ }
535
+
536
+ def _security_features(self, url: str, parsed, domain: str) -> dict:
537
+ """Security indicator features (URL-based only)."""
538
+ parts = domain.split('.')
539
+
540
+ return {
541
+ # IP address
542
+ 'has_ip_address': 1 if self._is_ip(domain) else 0,
543
+
544
+ # Suspicious patterns
545
+ 'has_at_symbol': 1 if '@' in url else 0,
546
+ 'has_redirect': 1 if 'redirect' in url.lower() or 'url=' in url.lower() else 0,
547
+
548
+ # URL shortener - FIXED: exact match only
549
+ 'is_shortened': self._is_url_shortener(domain),
550
+
551
+ # Free hosting - DEPRECATED (use is_free_platform instead)
552
+ 'is_free_hosting': self._is_free_platform(domain),
553
+
554
+ # NEW: Free platform detection (CRITICAL for your dataset!)
555
+ 'is_free_platform': self._is_free_platform(domain),
556
+ 'platform_subdomain_length': self._get_platform_subdomain_length(domain),
557
+ 'has_uuid_subdomain': self._detect_uuid_pattern(domain),
558
+ }
559
+
560
+ def _keyword_features(self, url: str, domain: str, path: str, parsed) -> dict:
561
+ """Keyword and brand detection features."""
562
+ url_lower = url.lower()
563
+ domain_lower = domain.lower()
564
+ path_lower = path.lower()
565
+
566
+ # Count phishing keywords
567
+ phishing_in_url = sum(1 for k in self.phishing_keywords if k in url_lower)
568
+ phishing_in_domain = sum(1 for k in self.phishing_keywords if k in domain_lower)
569
+ phishing_in_path = sum(1 for k in self.phishing_keywords if k in path_lower)
570
+
571
+ # Count brand names
572
+ brands_in_url = sum(1 for b in self.brand_names if b in url_lower)
573
+ brands_in_domain = sum(1 for b in self.brand_names if b in domain_lower)
574
+ brands_in_path = sum(1 for b in self.brand_names if b in path_lower)
575
+
576
+ # Brand impersonation
577
+ brand_impersonation = 1 if brands_in_path > 0 and brands_in_domain == 0 else 0
578
+
579
+ return {
580
+ 'num_phishing_keywords': phishing_in_url,
581
+ 'phishing_in_domain': phishing_in_domain,
582
+ 'phishing_in_path': phishing_in_path,
583
+
584
+ 'num_brands': brands_in_url,
585
+ 'brand_in_domain': 1 if brands_in_domain > 0 else 0,
586
+ 'brand_in_path': 1 if brands_in_path > 0 else 0,
587
+ 'brand_impersonation': brand_impersonation,
588
+
589
+ # Specific high-value keywords
590
+ 'has_login': 1 if 'login' in url_lower or 'signin' in url_lower else 0,
591
+ 'has_account': 1 if 'account' in url_lower else 0,
592
+ 'has_verify': 1 if 'verify' in url_lower or 'confirm' in url_lower else 0,
593
+ 'has_secure': 1 if 'secure' in url_lower or 'security' in url_lower else 0,
594
+ 'has_update': 1 if 'update' in url_lower else 0,
595
+ 'has_bank': 1 if 'bank' in url_lower else 0,
596
+ 'has_password': 1 if 'password' in url_lower or 'passwd' in url_lower else 0,
597
+ 'has_suspend': 1 if 'suspend' in url_lower or 'locked' in url_lower else 0,
598
+
599
+ # Suspicious patterns
600
+ 'has_webscr': 1 if 'webscr' in url_lower else 0,
601
+ 'has_cmd': 1 if 'cmd=' in url_lower else 0,
602
+ 'has_cgi': 1 if 'cgi-bin' in url_lower or 'cgi_bin' in url_lower else 0,
603
+
604
+ # Advanced brand spoofing features
605
+ 'brand_in_subdomain_not_domain': self._brand_subdomain_spoofing(parsed),
606
+ 'multiple_brands_in_url': 1 if brands_in_url >= 2 else 0,
607
+ 'brand_with_hyphen': self._brand_with_hyphen(domain_lower),
608
+ 'suspicious_brand_tld': self._suspicious_brand_tld(domain),
609
+ 'brand_keyword_combo': self._brand_phishing_keyword_combo(url_lower),
610
+ }
611
+
612
+ def _encoding_features(self, url: str, domain: str) -> dict:
613
+ """Encoding-related features."""
614
+ has_punycode = 'xn--' in domain
615
+
616
+ try:
617
+ decoded = unquote(url)
618
+ encoding_diff = len(decoded) - len(url)
619
+ except:
620
+ encoding_diff = 0
621
+
622
+ try:
623
+ has_hex = 1 if re.search(r'[0-9a-f]{20,}', url.lower()) else 0
624
+ except:
625
+ has_hex = 0
626
+
627
+ try:
628
+ has_base64 = 1 if re.search(r'[A-Za-z0-9+/]{30,}={0,2}', url) else 0
629
+ except:
630
+ has_base64 = 0
631
+
632
+ try:
633
+ has_unicode = 1 if any(ord(c) > 127 for c in url) else 0
634
+ except:
635
+ has_unicode = 0
636
+
637
+ return {
638
+ 'has_url_encoding': 1 if '%' in url else 0,
639
+ 'encoding_count': url.count('%'),
640
+ 'encoding_diff': abs(encoding_diff),
641
+ 'has_punycode': 1 if has_punycode else 0,
642
+ 'has_unicode': has_unicode,
643
+ 'has_hex_string': has_hex,
644
+ 'has_base64': has_base64,
645
+
646
+ # Homograph & encoding detection
647
+ 'has_lookalike_chars': self._detect_lookalike_chars(domain),
648
+ 'mixed_script_score': self._mixed_script_detection(domain),
649
+ 'homograph_brand_risk': self._homograph_brand_check(domain),
650
+ 'suspected_idn_homograph': self._idn_homograph_score(url),
651
+ 'double_encoding': self._detect_double_encoding(url),
652
+ 'encoding_in_domain': 1 if '%' in domain else 0,
653
+ 'suspicious_unicode_category': self._suspicious_unicode_chars(url),
654
+ }
655
+
656
+ # ============================================================
657
+ # HELPER METHODS
658
+ # ============================================================
659
+
660
+ def _entropy(self, text: str) -> float:
661
+ """Calculate Shannon entropy."""
662
+ if not text:
663
+ return 0.0
664
+ freq = Counter(text)
665
+ length = len(text)
666
+ return -sum((c / length) * math.log2(c / length) for c in freq.values())
667
+
668
+ def _max_consecutive(self, text: str, condition) -> int:
669
+ """Max consecutive characters matching condition."""
670
+ max_count = count = 0
671
+ for char in text:
672
+ if condition(char):
673
+ count += 1
674
+ max_count = max(max_count, count)
675
+ else:
676
+ count = 0
677
+ return max_count
678
+
679
+ def _max_consecutive_consonants(self, text: str) -> int:
680
+ """Max consecutive consonants."""
681
+ consonants = set('bcdfghjklmnpqrstvwxyz')
682
+ max_count = count = 0
683
+ for char in text.lower():
684
+ if char in consonants:
685
+ count += 1
686
+ max_count = max(max_count, count)
687
+ else:
688
+ count = 0
689
+ return max_count
690
+
691
+ def _repeat_rate(self, text: str) -> float:
692
+ """Rate of repeated adjacent characters."""
693
+ if len(text) < 2:
694
+ return 0.0
695
+ repeats = sum(1 for i in range(len(text) - 1) if text[i] == text[i + 1])
696
+ return repeats / (len(text) - 1)
697
+
698
+ def _unique_ngram_ratio(self, text: str, n: int) -> float:
699
+ """Ratio of unique n-grams to total n-grams."""
700
+ if len(text) < n:
701
+ return 0.0
702
+ ngrams = [text[i:i + n] for i in range(len(text) - n + 1)]
703
+ return len(set(ngrams)) / len(ngrams)
704
+
705
+ def _is_ip(self, domain: str) -> bool:
706
+ """Check if domain is IP address."""
707
+ # IPv4
708
+ if re.match(r'^(\d{1,3}\.){3}\d{1,3}$', domain):
709
+ return True
710
+ # IPv6
711
+ try:
712
+ socket.inet_pton(socket.AF_INET6, domain.strip('[]'))
713
+ return True
714
+ except:
715
+ return False
716
+
717
+ # ============================================================
718
+ # NEW/IMPROVED METHODS
719
+ # ============================================================
720
+
721
+ def _is_url_shortener(self, domain: str) -> int:
722
+ """
723
+ URL shortener detection - EXACT match.
724
+ """
725
+ domain_lower = domain.lower()
726
+ return 1 if domain_lower in self.shorteners else 0
727
+
728
+ def _is_free_platform(self, domain: str) -> int:
729
+ """
730
+ Detect if hosted on free platform.
731
+ CRITICAL FIX: Exact or suffix match (not substring!).
732
+
733
+ Examples:
734
+ - 'mysite.weebly.com' → 1 (suffix match)
735
+ - 'weebly.com' → 1 (exact match)
736
+ - 'weebly-alternative.com' → 0 (NOT a match!)
737
+ """
738
+ domain_lower = domain.lower()
739
+
740
+ # Exact match
741
+ if domain_lower in self.free_platforms:
742
+ return 1
743
+
744
+ if domain_lower in self.google_services:
745
+ return 1
746
+
747
+ if domain_lower in self.adobe_services:
748
+ return 1
749
+
750
+ if domain_lower in self.microsoft_services:
751
+ return 1
752
+
753
+ if domain_lower in self.zoom_services:
754
+ return 1
755
+
756
+ # Suffix match (subdomain.platform.com)
757
+ for platform in self.free_platforms:
758
+ if domain_lower.endswith('.' + platform):
759
+ return 1
760
+
761
+ return 0
762
+
763
+ def _get_platform_subdomain_length(self, domain: str) -> int:
764
+ """
765
+ IMPROVED: Handle multi-level subdomains.
766
+
767
+ Examples:
768
+ - docs.google.com → subdomain = 'docs' (4 chars)
769
+ - new.express.adobe.com → subdomain = 'new.express' (11 chars)
770
+ - storage.cloud.google.com → subdomain = 'storage.cloud' (13 chars)
771
+ """
772
+ domain_lower = domain.lower()
773
+
774
+ # Check Google
775
+ if '.google.com' in domain_lower:
776
+ subdomain = domain_lower.replace('.google.com', '')
777
+ return len(subdomain)
778
+
779
+ # Check Adobe
780
+ if '.adobe.com' in domain_lower:
781
+ subdomain = domain_lower.replace('.adobe.com', '')
782
+ return len(subdomain)
783
+
784
+ # Check Microsoft
785
+ if '.office.com' in domain_lower:
786
+ subdomain = domain_lower.replace('.office.com', '')
787
+ return len(subdomain)
788
+
789
+ # Check free platforms (existing logic)
790
+ for platform in self.free_platforms:
791
+ if domain_lower.endswith('.' + platform):
792
+ subdomain = domain_lower[:-len('.' + platform)]
793
+ return len(subdomain)
794
+
795
+ return 0
796
+
797
+ def _detect_uuid_pattern(self, domain: str) -> int:
798
+ """
799
+ Detect UUID patterns in subdomain (Replit, Firebase, etc.).
800
+
801
+ Example:
802
+ 'b82dba2b-fde4-4477-b6d5-8b17144e1bee.replit.dev' → 1
803
+ """
804
+ # UUID pattern: 8-4-4-4-12 hex characters
805
+ uuid_pattern = r'[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}'
806
+
807
+ return 1 if re.search(uuid_pattern, domain.lower()) else 0
808
+
809
+ # ============================================================
810
+ # DOMAIN RANDOMNESS HELPERS
811
+ # ============================================================
812
+
813
+ def _calculate_domain_randomness(self, domain: str) -> float:
814
+ """Calculate randomness score for domain (0-1)."""
815
+ if not domain or len(domain) < 4:
816
+ return 0.5
817
+
818
+ domain_lower = domain.lower()
819
+ scores = []
820
+
821
+ # 1. Vowel distribution
822
+ vowels = 'aeiou'
823
+ vowel_positions = [i for i, c in enumerate(domain_lower) if c in vowels]
824
+ if len(vowel_positions) >= 2:
825
+ avg_gap = sum(vowel_positions[i+1] - vowel_positions[i]
826
+ for i in range(len(vowel_positions)-1)) / (len(vowel_positions)-1)
827
+ vowel_irregularity = min(abs(avg_gap - 2.5) / 5, 1.0)
828
+ scores.append(vowel_irregularity)
829
+
830
+ # 2. Character frequency
831
+ char_freq = Counter(domain_lower)
832
+ common_letters = 'etaoinshr'
833
+ common_count = sum(char_freq.get(c, 0) for c in common_letters)
834
+ uncommon_ratio = 1 - (common_count / max(len(domain_lower), 1))
835
+ scores.append(uncommon_ratio)
836
+
837
+ # 3. Repeated characters
838
+ unique_ratio = len(set(domain_lower)) / max(len(domain_lower), 1)
839
+ if unique_ratio > 0.75:
840
+ scores.append((unique_ratio - 0.75) / 0.25)
841
+ else:
842
+ scores.append(0)
843
+
844
+ return min(sum(scores) / max(len(scores), 1), 1.0)
845
+
846
+ def _consonant_clustering_score(self, text: str) -> float:
847
+ """Detect unnatural consonant clusters."""
848
+ if not text:
849
+ return 0
850
+
851
+ text_lower = text.lower()
852
+ consonants = 'bcdfghjklmnpqrstvwxyz'
853
+
854
+ max_cluster = 0
855
+ current_cluster = 0
856
+
857
+ for char in text_lower:
858
+ if char in consonants:
859
+ current_cluster += 1
860
+ max_cluster = max(max_cluster, current_cluster)
861
+ else:
862
+ current_cluster = 0
863
+
864
+ if max_cluster >= 5:
865
+ return 1.0
866
+ elif max_cluster >= 4:
867
+ return 0.7
868
+ elif max_cluster >= 3:
869
+ return 0.4
870
+ else:
871
+ return 0.0
872
+
873
+ def _keyboard_pattern_score(self, text: str) -> int:
874
+ """Detect keyboard walking patterns."""
875
+ if not text:
876
+ return 0
877
+
878
+ text_lower = text.lower()
879
+ count = 0
880
+
881
+ for pattern in self.keyboard_patterns:
882
+ if pattern in text_lower:
883
+ count += 1
884
+
885
+ return count
886
+
887
+ def _contains_dictionary_word(self, text: str) -> int:
888
+ """Check if text contains any common English word."""
889
+ if not text or len(text) < 4:
890
+ return 0
891
+
892
+ text_lower = text.lower()
893
+
894
+ if text_lower in self.common_words:
895
+ return 1
896
+
897
+ for word in self.common_words:
898
+ if len(word) >= 4 and word in text_lower:
899
+ return 1
900
+
901
+ return 0
902
+
903
+ def _pronounceability_score(self, text: str) -> float:
904
+ """Score based on bigram frequencies in English."""
905
+ if not text or len(text) < 2:
906
+ return 0.5
907
+
908
+ text_lower = text.lower()
909
+
910
+ common_bigrams = {
911
+ 'th', 'he', 'in', 'er', 'an', 're', 'on', 'at', 'en', 'nd',
912
+ 'ti', 'es', 'or', 'te', 'of', 'ed', 'is', 'it', 'al', 'ar',
913
+ 'st', 'to', 'nt', 'ng', 'se', 'ha', 'as', 'ou', 'io', 've'
914
+ }
915
+
916
+ bigrams = [text_lower[i:i+2] for i in range(len(text_lower)-1)]
917
+
918
+ if not bigrams:
919
+ return 0.5
920
+
921
+ common_count = sum(1 for bg in bigrams if bg in common_bigrams)
922
+ score = common_count / len(bigrams)
923
+
924
+ return score
925
+
926
+ def _suspicious_digit_position(self, text: str) -> int:
927
+ """Detect suspicious digit positions."""
928
+ if not text:
929
+ return 0
930
+
931
+ if text and text[0].isdigit():
932
+ return 1
933
+
934
+ if len(text) >= 2 and text[-1].isdigit() and text[-2].isdigit():
935
+ return 1
936
+
937
+ return 0
938
+
939
+ # ============================================================
940
+ # BRAND SPOOFING HELPERS
941
+ # ============================================================
942
+
943
+ def _brand_subdomain_spoofing(self, parsed) -> int:
944
+ """Detect brand in subdomain but not main domain."""
945
+ try:
946
+ parts = parsed.netloc.split('.')
947
+ if len(parts) < 3:
948
+ return 0
949
+
950
+ subdomains = '.'.join(parts[:-2]).lower()
951
+ main_domain = '.'.join(parts[-2:]).lower()
952
+
953
+ for brand in self.brand_names:
954
+ if brand in subdomains and brand not in main_domain:
955
+ return 1
956
+
957
+ return 0
958
+ except:
959
+ return 0
960
+
961
+ def _brand_with_hyphen(self, domain: str) -> int:
962
+ """Detect hyphenated brand names."""
963
+ if not domain:
964
+ return 0
965
+
966
+ domain_lower = domain.lower()
967
+
968
+ for brand in self.brand_names:
969
+ if len(brand) >= 4:
970
+ for i in range(1, len(brand)):
971
+ hyphenated = brand[:i] + '-' + brand[i:]
972
+ if hyphenated in domain_lower:
973
+ return 1
974
+
975
+ return 0
976
+
977
+ def _suspicious_brand_tld(self, domain: str) -> int:
978
+ """Detect brand name with suspicious TLD."""
979
+ if not domain:
980
+ return 0
981
+
982
+ domain_lower = domain.lower()
983
+ parts = domain_lower.split('.')
984
+
985
+ if len(parts) < 2:
986
+ return 0
987
+
988
+ tld = parts[-1]
989
+ domain_without_tld = '.'.join(parts[:-1])
990
+
991
+ if tld in self.suspicious_tlds:
992
+ for brand in self.brand_names:
993
+ if brand in domain_without_tld:
994
+ return 1
995
+
996
+ return 0
997
+
998
+ def _brand_phishing_keyword_combo(self, url: str) -> int:
999
+ """Detect brand + phishing keyword combination."""
1000
+ if not url:
1001
+ return 0
1002
+
1003
+ url_lower = url.lower()
1004
+
1005
+ has_brand = any(brand in url_lower for brand in self.brand_names)
1006
+
1007
+ if has_brand:
1008
+ phishing_combo_keywords = [
1009
+ 'verify', 'security', 'secure', 'account', 'update',
1010
+ 'login', 'confirm', 'suspended', 'locked'
1011
+ ]
1012
+ for keyword in phishing_combo_keywords:
1013
+ if keyword in url_lower:
1014
+ return 1
1015
+
1016
+ return 0
1017
+
1018
+ # ============================================================
1019
+ # PATH & QUERY HELPERS
1020
+ # ============================================================
1021
+
1022
+ def _brand_in_path_only(self, path: str, domain: str) -> int:
1023
+ """Detect brand in path but not in domain."""
1024
+ if not path or not domain:
1025
+ return 0
1026
+
1027
+ path_lower = path.lower()
1028
+ domain_lower = domain.lower()
1029
+
1030
+ for brand in self.brand_names:
1031
+ if brand in path_lower and brand not in domain_lower:
1032
+ return 1
1033
+
1034
+ return 0
1035
+
1036
+ def _suspicious_extension_pattern(self, path: str) -> int:
1037
+ """Detect suspicious extension patterns."""
1038
+ if not path:
1039
+ return 0
1040
+
1041
+ path_lower = path.lower()
1042
+
1043
+ suspicious_patterns = [
1044
+ '.php.exe', '.html.exe', '.pdf.exe', '.doc.exe',
1045
+ '.zip.exe', '.rar.exe', '.html.zip', '.pdf.scr'
1046
+ ]
1047
+
1048
+ for pattern in suspicious_patterns:
1049
+ if pattern in path_lower:
1050
+ return 1
1051
+
1052
+ parts = path_lower.split('.')
1053
+ if len(parts) >= 3:
1054
+ ext1 = parts[-2]
1055
+ ext2 = parts[-1]
1056
+
1057
+ doc_exts = ['pdf', 'doc', 'docx', 'xls', 'xlsx', 'html', 'htm']
1058
+ exec_exts = ['exe', 'scr', 'bat', 'cmd', 'com', 'pif']
1059
+
1060
+ if ext1 in doc_exts and ext2 in exec_exts:
1061
+ return 1
1062
+
1063
+ return 0
1064
+
1065
+ # ============================================================
1066
+ # ENCODING HELPERS
1067
+ # ============================================================
1068
+
1069
+ def _detect_lookalike_chars(self, domain: str) -> int:
1070
+ """Detect lookalike characters."""
1071
+ if not domain:
1072
+ return 0
1073
+
1074
+ domain_lower = domain.lower()
1075
+
1076
+ suspicious_patterns = [
1077
+ ('rn', 'm'),
1078
+ ('vv', 'w'),
1079
+ ('cl', 'd'),
1080
+ ]
1081
+
1082
+ for pattern, _ in suspicious_patterns:
1083
+ if pattern in domain_lower:
1084
+ return 1
1085
+
1086
+ if any(c in domain_lower for c in ['0', '1']):
1087
+ has_letters = any(c.isalpha() for c in domain_lower)
1088
+ if has_letters:
1089
+ for lookalike_char in self.lookalike_chars:
1090
+ if lookalike_char in domain_lower:
1091
+ return 1
1092
+
1093
+ return 0
1094
+
1095
+ def _mixed_script_detection(self, domain: str) -> int:
1096
+ """Detect mixing of scripts."""
1097
+ if not domain:
1098
+ return 0
1099
+
1100
+ scripts = set()
1101
+
1102
+ for char in domain:
1103
+ if char.isalpha():
1104
+ try:
1105
+ script = unicodedata.name(char).split()[0]
1106
+ if script in ['LATIN', 'CYRILLIC', 'GREEK']:
1107
+ scripts.add(script)
1108
+ except:
1109
+ pass
1110
+
1111
+ return len(scripts) if len(scripts) > 1 else 0
1112
+
1113
+ def _homograph_brand_check(self, domain: str) -> int:
1114
+ """Check for homograph attacks on brands."""
1115
+ if not domain:
1116
+ return 0
1117
+
1118
+ domain_lower = domain.lower()
1119
+ top_brands = ['paypal', 'apple', 'amazon', 'google', 'microsoft', 'facebook']
1120
+
1121
+ for brand in top_brands:
1122
+ if len(domain_lower) < len(brand) - 2 or len(domain_lower) > len(brand) + 2:
1123
+ continue
1124
+
1125
+ differences = 0
1126
+ for i in range(min(len(domain_lower), len(brand))):
1127
+ if i < len(domain_lower) and i < len(brand):
1128
+ if domain_lower[i] != brand[i]:
1129
+ if (domain_lower[i] in '01' and brand[i] in 'ol') or \
1130
+ (domain_lower[i] in 'ol' and brand[i] in '01'):
1131
+ differences += 1
1132
+ else:
1133
+ differences += 1
1134
+
1135
+ if differences <= 2 and differences > 0:
1136
+ return 1
1137
+
1138
+ return 0
1139
+
1140
+ def _idn_homograph_score(self, url: str) -> float:
1141
+ """Combined IDN homograph attack score."""
1142
+ score = 0.0
1143
+ count = 0
1144
+
1145
+ if 'xn--' in url.lower():
1146
+ score += 0.5
1147
+ count += 1
1148
+
1149
+ non_ascii = sum(1 for c in url if ord(c) > 127)
1150
+ if non_ascii > 0:
1151
+ score += min(non_ascii / 10, 0.3)
1152
+ count += 1
1153
+
1154
+ return score / max(count, 1) if count > 0 else 0.0
1155
+
1156
+ def _detect_double_encoding(self, url: str) -> int:
1157
+ """Detect double URL encoding."""
1158
+ if not url:
1159
+ return 0
1160
+
1161
+ double_encoded_patterns = ['%25', '%2520', '%252e', '%252f']
1162
+ count = sum(url.lower().count(pattern) for pattern in double_encoded_patterns)
1163
+
1164
+ return count
1165
+
1166
+ def _suspicious_unicode_chars(self, url: str) -> int:
1167
+ """Detect uncommon Unicode categories."""
1168
+ if not url:
1169
+ return 0
1170
+
1171
+ suspicious_count = 0
1172
+
1173
+ for char in url:
1174
+ try:
1175
+ category = unicodedata.category(char)
1176
+ if category in ['Mn', 'Mc', 'Me', 'Zl', 'Zp',
1177
+ 'Cc', 'Cf', 'Sm', 'Sc', 'Sk', 'So']:
1178
+ suspicious_count += 1
1179
+ except:
1180
+ pass
1181
+
1182
+ return suspicious_count
1183
+
1184
+ # ============================================================
1185
+ # FEATURE REFINEMENT HELPERS
1186
+ # ============================================================
1187
+
1188
+ def _categorize_length(self, length: int, thresholds: list) -> int:
1189
+ """Multi-category encoding for length features."""
1190
+ for i, threshold in enumerate(thresholds):
1191
+ if length <= threshold:
1192
+ return i
1193
+ return len(thresholds)
1194
+
1195
+ def _categorize_extension(self, extension: str) -> int:
1196
+ """
1197
+ Categorize file extension:
1198
+ 0 = none
1199
+ 1 = document
1200
+ 2 = web/script
1201
+ 3 = executable
1202
+ 4 = archive
1203
+ 5 = image
1204
+ 6 = other
1205
+ """
1206
+ if not extension:
1207
+ return 0
1208
+
1209
+ ext_lower = extension.lower()
1210
+
1211
+ if ext_lower in ['pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx', 'txt', 'rtf']:
1212
+ return 1
1213
+
1214
+ if ext_lower in ['html', 'htm', 'php', 'asp', 'aspx', 'jsp', 'js', 'css']:
1215
+ return 2
1216
+
1217
+ if ext_lower in ['exe', 'bat', 'cmd', 'scr', 'msi', 'com', 'pif', 'app', 'apk']:
1218
+ return 3
1219
+
1220
+ if ext_lower in ['zip', 'rar', '7z', 'tar', 'gz', 'bz2']:
1221
+ return 4
1222
+
1223
+ if ext_lower in ['jpg', 'jpeg', 'png', 'gif', 'svg', 'ico', 'webp']:
1224
+ return 5
1225
+
1226
+ return 6
1227
+
1228
+ def _character_diversity(self, text: str) -> float:
1229
+ """Shannon diversity index for characters."""
1230
+ if not text:
1231
+ return 0.0
1232
+
1233
+ unique_chars = len(set(text))
1234
+ return min(unique_chars / max(len(text), 1), 1.0)
1235
+
1236
+ def _calculate_url_complexity(self, url: str) -> float:
1237
+ """Combined URL complexity score."""
1238
+ if not url:
1239
+ return 0.0
1240
+
1241
+ special_chars = sum(1 for c in url if not c.isalnum() and c not in [':', '/', '.'])
1242
+ special_ratio = special_chars / max(len(url), 1)
1243
+
1244
+ length_score = min(len(url) / 200, 1.0)
1245
+
1246
+ encoding_score = min(url.count('%') / 10, 1.0)
1247
+
1248
+ complexity = (special_ratio * 0.4 + length_score * 0.3 + encoding_score * 0.3)
1249
+
1250
+ return min(complexity, 1.0)
1251
+
1252
+ # ============================================================
1253
+ # UTILITY METHODS
1254
+ # ============================================================
1255
+
1256
+ def _get_default_features(self) -> dict:
1257
+ """Default feature values for error cases."""
1258
+ # Get feature names dynamically
1259
+ dummy_url = "http://example.com"
1260
+ try:
1261
+ return self.extract_features(dummy_url)
1262
+ except:
1263
+ return {}
1264
+
1265
+ def get_feature_names(self) -> list:
1266
+ """
1267
+ Get list of all feature names DYNAMICALLY.
1268
+ FIXED: No longer hardcoded!
1269
+ """
1270
+ dummy_url = "http://example.com/test"
1271
+ dummy_features = self.extract_features(dummy_url)
1272
+
1273
+ # Remove 'label' if present
1274
+ feature_names = [k for k in dummy_features.keys() if k != 'label']
1275
+
1276
+ return sorted(feature_names)
1277
+
1278
+ def extract_batch(self, urls: list, show_progress: bool = True) -> pd.DataFrame:
1279
+ """
1280
+ Extract features from multiple URLs.
1281
+
1282
+ Args:
1283
+ urls: List of URL strings
1284
+ show_progress: Show progress messages
1285
+
1286
+ Returns:
1287
+ DataFrame with features
1288
+ """
1289
+ if show_progress:
1290
+ logger.info(f"Extracting URL features from {len(urls):,} URLs...")
1291
+
1292
+ features_list = []
1293
+ progress_interval = 50000
1294
+
1295
+ for i, url in enumerate(urls):
1296
+ if show_progress and i > 0 and i % progress_interval == 0:
1297
+ logger.info(f" Processed {i:,} / {len(urls):,} ({100 * i / len(urls):.1f}%)")
1298
+
1299
+ features = self.extract_features(url)
1300
+ features_list.append(features)
1301
+
1302
+ df = pd.DataFrame(features_list)
1303
+
1304
+ if show_progress:
1305
+ logger.info(f"✓ Extracted {len(df.columns)} features from {len(df):,} URLs")
1306
+
1307
+ return df
1308
+
1309
+
1310
def main():
    """CLI entry point: extract URL-only features from the cleaned dataset.

    Reads data/processed/clean_dataset.csv (relative to this script),
    optionally samples it, extracts features with URLFeatureExtractorV2,
    and writes a CSV to data/features/.

    Command-line flags:
        --sample N   Randomly sample N URLs (seeded for reproducibility).
        --output F   Output filename (defaults to url_features_v2[_sampleN].csv).
    """
    import argparse

    parser = argparse.ArgumentParser(description='URL-Only Feature Extraction v2.1 (IMPROVED)')
    parser.add_argument('--sample', type=int, default=None, help='Sample N URLs')
    parser.add_argument('--output', type=str, default=None, help='Output filename')
    args = parser.parse_args()

    # Banner summarizing what changed in this extractor version.
    logger.info("=" * 70)
    logger.info("URL-Only Feature Extraction v2")
    logger.info("=" * 70)
    logger.info("")
    logger.info("NEW Features:")
    logger.info("  - Fixed free platform detection (exact/suffix match)")
    logger.info("  - Added platform_subdomain_length")
    logger.info("  - Added has_uuid_subdomain")
    logger.info("  - Added longest_part thresholds (gt_20, gt_30, gt_40)")
    logger.info("  - Expanded brand list with regional brands")
    logger.info("  - Improved extension categorization")
    logger.info("")

    # Load dataset (path resolved relative to this script's location)
    script_dir = Path(__file__).parent
    data_file = (script_dir / '../../data/processed/clean_dataset.csv').resolve()

    logger.info(f"Loading: {data_file.name}")
    df = pd.read_csv(data_file)
    logger.info(f"Loaded: {len(df):,} URLs")

    # Optional sampling; fixed seed keeps runs reproducible.
    if args.sample and args.sample < len(df):
        df = df.sample(n=args.sample, random_state=42)
        logger.info(f"Sampled: {len(df):,} URLs")

    # Extract features
    extractor = URLFeatureExtractorV2()
    features_df = extractor.extract_batch(df['url'].tolist())
    # Carry the supervised label through alongside the features.
    features_df['label'] = df['label'].values

    # Save
    output_dir = (script_dir / '../../data/features').resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    if args.output:
        output_file = output_dir / args.output
    else:
        suffix = f'_sample{args.sample}' if args.sample else ''
        output_file = output_dir / f'url_features_v2{suffix}.csv'

    features_df.to_csv(output_file, index=False)

    logger.info("")
    logger.info("=" * 70)
    logger.info(f"✓ Saved: {output_file}")
    logger.info(f"  Shape: {features_df.shape}")
    logger.info(f"  Features: {len(features_df.columns) - 1}")
    logger.info("=" * 70)

    # Show feature names (derived dynamically by the extractor)
    print("\nAll Features:")
    feature_names = extractor.get_feature_names()
    for i, name in enumerate(feature_names, 1):
        print(f"{i:3d}. {name}")

    # Show stats
    print("\n\nFeature Statistics (first 30):")
    print(features_df.describe().T.head(30))

    # Show new features stats (sanity check that the new signals fire)
    print("\n\nNEW FEATURES Statistics:")
    new_features = [
        'is_free_platform', 'platform_subdomain_length', 'has_uuid_subdomain',
        'longest_part_gt_20', 'longest_part_gt_30', 'longest_part_gt_40'
    ]
    for feat in new_features:
        if feat in features_df.columns:
            if feat == 'platform_subdomain_length':
                print(f"\n{feat}:")
                print(f"  Mean: {features_df[feat].mean():.2f}")
                print(f"  Max: {features_df[feat].max()}")
                print(f"  Non-zero: {(features_df[feat] > 0).sum()} ({(features_df[feat] > 0).sum() / len(features_df) * 100:.1f}%)")
            else:
                print(f"\n{feat}: {features_df[feat].sum()} / {len(features_df)} ({features_df[feat].mean() * 100:.1f}%)")
1394
+
1395
# Script entry point: run the feature-extraction CLI.
if __name__ == "__main__":
    main()
scripts/feature_extraction/url/url_features_v3.py ADDED
@@ -0,0 +1,866 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ URL Feature Extraction v2.2 - OPTIMIZED & NORMALIZED
3
+
4
+ KEY IMPROVEMENTS:
5
+ 1. ✅ URL Normalization - www.github.com & github.com produce identical features
6
+ 2. ✅ Scheme normalization - http/https handled consistently
7
+ 3. ✅ Removed redundant features (www_in_middle, www_subdomain_only, etc.)
8
+ 4. ✅ Separated is_free_hosting from is_free_platform (both are important!)
9
+ 5. ✅ Focus on TOP 20 most important features from model analysis
10
+ 6. ✅ Optimized for production use (< 1ms per URL)
11
+
12
+ TOP FEATURES (from your model analysis):
13
+ - domain_dots (XGB: 27.7%, RF: 4.4%)
14
+ - is_shortened (XGB: 24.7%, RF: 6.2%)
15
+ - is_free_hosting (XGB: 10.6%, RF: 4.0%)
16
+ - is_free_platform (XGB: 9.2%, RF: 4.1%)
17
+ - num_subdomains (XGB: 4.2%, RF: 5.2%)
18
+ - domain_length (XGB: 0.8%, RF: 5.0%)
19
+ - domain_entropy, url_entropy, path features...
20
+ """
21
+ import re
22
+ import math
23
+ import argparse
24
+ import logging
25
+ import pandas as pd
26
+ from urllib.parse import urlparse, parse_qs, unquote
27
+ from collections import Counter
28
+ from pathlib import Path
29
+
30
+ # Setup logging
31
+ logging.basicConfig(
32
+ level=logging.INFO,
33
+ format='%(asctime)s - %(levelname)s - %(message)s',
34
+ datefmt='%H:%M:%S'
35
+ )
36
+ logger = logging.getLogger("url_features_optimized")
37
+
38
+
39
+ class URLFeatureExtractorOptimized:
40
+ """
41
+ Optimized URL feature extractor with normalization.
42
+
43
+ KEY: Normalizes www/http variants for consistent features!
44
+ """
45
+
46
+ def __init__(self):
47
+ """Initialize with keyword lists - OPTIMIZED"""
48
+
49
+ # Phishing keywords (top indicators)
50
+ self.phishing_keywords = [
51
+ 'login', 'signin', 'account', 'update', 'verify', 'secure',
52
+ 'banking', 'confirm', 'password', 'suspended', 'authenticate',
53
+ 'wallet', 'payment', 'billing', 'expire', 'urgent', 'alert'
54
+ ]
55
+
56
+ # Brand names (expanded with regional)
57
+ self.brand_names = [
58
+ 'paypal', 'ebay', 'amazon', 'apple', 'microsoft', 'google',
59
+ 'facebook', 'instagram', 'twitter', 'x', 'netflix', 'linkedin',
60
+ 'dropbox', 'adobe', 'spotify', 'steam', 'zoom', 'docusign',
61
+ 'chase', 'wellsfargo', 'bankofamerica', 'citibank', 'citi',
62
+ 'visa', 'mastercard', 'amex', 'capitalone',
63
+ 'outlook', 'office365', 'gmail', 'yahoo', 'icloud', 'whatsapp',
64
+ 'dhl', 'fedex', 'ups', 'usps', 'alibaba',
65
+ 'coinbase', 'binance', 'blockchain', 'metamask', 'stripe',
66
+ 'tiktok', 'snapchat', 'roblox'
67
+ ]
68
+
69
+ # URL shorteners - EXACT match only
70
+ self.shorteners = {
71
+ 'bit.ly', 'bitly.com', 'goo.gl', 'tinyurl.com', 't.co', 'ow.ly',
72
+ 'is.gd', 'buff.ly', 'adf.ly', 'short.to', 'tiny.cc', 'rb.gy',
73
+ 'cutt.ly', 'qrco.de', 'linktr.ee', 'linkin.bio'
74
+ }
75
+
76
+ # Suspicious TLDs
77
+ self.suspicious_tlds = {
78
+ 'tk', 'ml', 'ga', 'cf', 'gq', 'xyz', 'top', 'club', 'work',
79
+ 'date', 'loan', 'download', 'click', 'link', 'zip', 'mov'
80
+ }
81
+
82
+ # Trusted TLDs
83
+ self.trusted_tlds = {
84
+ 'com', 'org', 'net', 'edu', 'gov', 'mil', 'uk', 'us', 'ca',
85
+ 'de', 'fr', 'jp', 'au', 'nl', 'it', 'es'
86
+ }
87
+
88
+ # FREE HOSTING - separate from platforms!
89
+ self.free_hosting = {
90
+ '000webhostapp.com', 'freehosting.com', 'freehostia.com',
91
+ '5gbfree.com', 'x10hosting.com', 'awardspace.com',
92
+ 'byet.host', 'infinityfree.com', 'webcindario.com'
93
+ }
94
+
95
+ # FREE PLATFORMS - frequently abused for phishing
96
+ self.free_platforms = {
97
+ # Website builders
98
+ 'weebly.com', 'wixsite.com', 'wix.com', 'webflow.io',
99
+ 'carrd.co', 'notion.site', 'webwave.me', 'godaddysites.com',
100
+ 'square.site', 'sites.google.com',
101
+ # Cloud platforms
102
+ 'firebaseapp.com', 'web.app', 'appspot.com',
103
+ 'github.io', 'gitlab.io', 'vercel.app', 'netlify.app',
104
+ 'replit.dev', 'repl.co', 'glitch.me', 'herokuapp.com',
105
+ 'onrender.com', 'railway.app', 'fly.dev', 'pages.dev',
106
+ # Blogging
107
+ 'wordpress.com', 'blogspot.com', 'blogger.com', 'tumblr.com',
108
+ # Forms/docs
109
+ 'jotform.com', 'typeform.com', 'forms.gle',
110
+ # File sharing
111
+ 'dropboxusercontent.com', 'sharepoint.com', '1drv.ms'
112
+ }
113
+
114
+ # Common English words
115
+ self.common_words = {
116
+ 'about', 'account', 'after', 'all', 'also', 'app', 'apple', 'area',
117
+ 'back', 'bank', 'best', 'book', 'business', 'call', 'can', 'card',
118
+ 'center', 'check', 'city', 'cloud', 'come', 'company', 'contact',
119
+ 'data', 'day', 'digital', 'email', 'file', 'find', 'first', 'free',
120
+ 'from', 'game', 'get', 'global', 'good', 'group', 'help', 'home',
121
+ 'info', 'just', 'keep', 'like', 'link', 'login', 'mail', 'main',
122
+ 'make', 'media', 'money', 'more', 'name', 'need', 'network', 'new',
123
+ 'news', 'next', 'office', 'online', 'only', 'open', 'page', 'pay',
124
+ 'people', 'phone', 'place', 'post', 'product', 'read', 'real',
125
+ 'search', 'secure', 'service', 'services', 'shop', 'sign', 'site',
126
+ 'start', 'support', 'system', 'tech', 'time', 'today', 'update',
127
+ 'user', 'verify', 'view', 'web', 'website', 'work', 'world'
128
+ }
129
+
130
+ # Keyboard patterns
131
+ self.keyboard_patterns = [
132
+ 'qwerty', 'asdfgh', 'zxcvbn', '12345', '123456', 'qwertyuiop'
133
+ ]
134
+
135
+ def normalize_url(self, url: str) -> tuple:
136
+ """
137
+ Normalize URL for consistent feature extraction.
138
+
139
+ CRITICAL: www.github.com and github.com should have same features!
140
+
141
+ Returns:
142
+ (normalized_url, original_domain, normalized_domain, is_http)
143
+ """
144
+ # Ensure scheme
145
+ if not url.startswith(('http://', 'https://')):
146
+ url = 'https://' + url
147
+
148
+ parsed = urlparse(url.lower())
149
+ original_domain = parsed.netloc.split(':')[0] # Remove port
150
+
151
+ # Normalize domain (remove www)
152
+ has_www = original_domain.startswith('www.')
153
+ normalized_domain = original_domain[4:] if has_www else original_domain
154
+
155
+ # Track if originally HTTP (security feature)
156
+ is_http = parsed.scheme == 'http'
157
+
158
+ # Rebuild URL with normalized domain and https
159
+ normalized_url = f"https://{normalized_domain}{parsed.path}"
160
+ if parsed.query:
161
+ normalized_url += f"?{parsed.query}"
162
+
163
+ return normalized_url, original_domain, normalized_domain, is_http
164
+
165
+ def extract_features(self, url: str) -> dict:
166
+ """
167
+ Extract features with URL normalization.
168
+
169
+ www.github.com and github.com produce IDENTICAL features!
170
+ """
171
+ try:
172
+ # Normalize URL
173
+ norm_url, orig_domain, norm_domain, is_http = self.normalize_url(url)
174
+
175
+ # Parse normalized URL
176
+ parsed = urlparse(norm_url)
177
+ domain = norm_domain
178
+ path = parsed.path
179
+ query = parsed.query
180
+
181
+ if not domain:
182
+ return self._get_default_features()
183
+
184
+ features = {}
185
+
186
+ # Extract all features using NORMALIZED URL/domain
187
+ features.update(self._length_features(norm_url, domain, path, query))
188
+ features.update(self._char_count_features(norm_url, domain, path))
189
+ features.update(self._ratio_features(norm_url, domain))
190
+ features.update(self._domain_features(domain, parsed))
191
+ features.update(self._path_features(path, domain))
192
+ features.update(self._query_features(query))
193
+ features.update(self._statistical_features(norm_url, domain, path))
194
+ features.update(self._security_features(norm_url, parsed, domain, is_http))
195
+ features.update(self._keyword_features(norm_url, domain, path))
196
+ features.update(self._encoding_features(norm_url, domain))
197
+
198
+ return features
199
+
200
+ except Exception as e:
201
+ logger.error(f"Error extracting features: {url[:50]}... Error: {e}")
202
+ return self._get_default_features()
203
+
204
+ # ============================================================
205
+ # FEATURE EXTRACTION METHODS
206
+ # ============================================================
207
+
208
+ def _length_features(self, url: str, domain: str, path: str, query: str) -> dict:
209
+ """Length-based features."""
210
+ url_len = len(url)
211
+ domain_len = len(domain)
212
+
213
+ return {
214
+ 'url_length': url_len,
215
+ 'domain_length': domain_len,
216
+ 'path_length': len(path),
217
+ 'query_length': len(query),
218
+ # Categorized lengths (0=short, 1=medium, 2=long, 3=very_long)
219
+ 'url_length_category': 0 if url_len < 40 else 1 if url_len < 75 else 2 if url_len < 120 else 3,
220
+ 'domain_length_category': 0 if domain_len < 10 else 1 if domain_len < 20 else 2 if domain_len < 30 else 3,
221
+ }
222
+
223
+ def _char_count_features(self, url: str, domain: str, path: str) -> dict:
224
+ """Character count features."""
225
+ return {
226
+ 'num_dots': url.count('.'),
227
+ 'num_hyphens': url.count('-'),
228
+ 'num_underscores': url.count('_'),
229
+ 'num_slashes': url.count('/'),
230
+ 'num_question_marks': url.count('?'),
231
+ 'num_ampersands': url.count('&'),
232
+ 'num_equals': url.count('='),
233
+ 'num_at': url.count('@'),
234
+ 'num_percent': url.count('%'),
235
+ 'num_digits_url': sum(c.isdigit() for c in url),
236
+ 'num_letters_url': sum(c.isalpha() for c in url),
237
+ # Domain-specific
238
+ 'domain_dots': domain.count('.'),
239
+ 'domain_hyphens': domain.count('-'),
240
+ 'domain_digits': sum(c.isdigit() for c in domain),
241
+ # Path-specific
242
+ 'path_slashes': path.count('/'),
243
+ 'path_dots': path.count('.'),
244
+ 'path_digits': sum(c.isdigit() for c in path),
245
+ }
246
+
247
+ def _ratio_features(self, url: str, domain: str) -> dict:
248
+ """Character ratio features."""
249
+ url_len = max(len(url), 1)
250
+ domain_len = max(len(domain), 1)
251
+
252
+ digit_count = sum(c.isdigit() for c in url)
253
+ letter_count = sum(c.isalpha() for c in url)
254
+ special_count = url_len - digit_count - letter_count
255
+
256
+ return {
257
+ 'digit_ratio_url': digit_count / url_len,
258
+ 'letter_ratio_url': letter_count / url_len,
259
+ 'special_char_ratio': special_count / url_len,
260
+ 'digit_ratio_domain': sum(c.isdigit() for c in domain) / domain_len,
261
+ 'symbol_ratio_domain': sum(c in '-_.' for c in domain) / domain_len,
262
+ }
263
+
264
+ def _domain_features(self, domain: str, parsed) -> dict:
265
+ """Domain structure features."""
266
+ parts = domain.split('.')
267
+ num_parts = len(parts)
268
+
269
+ # Subdomain count (e.g., sub.example.com = 1 subdomain)
270
+ num_subdomains = max(0, num_parts - 2) if num_parts >= 2 else 0
271
+
272
+ # TLD and SLD
273
+ tld = parts[-1] if parts else ''
274
+ sld = parts[-2] if len(parts) >= 2 else ''
275
+
276
+ # Domain part lengths
277
+ part_lens = [len(p) for p in parts]
278
+ longest_part = max(part_lens) if part_lens else 0
279
+ avg_part_len = sum(part_lens) / len(part_lens) if part_lens else 0
280
+
281
+ return {
282
+ 'num_subdomains': num_subdomains,
283
+ 'num_domain_parts': num_parts,
284
+ 'tld_length': len(tld),
285
+ 'sld_length': len(sld),
286
+ 'longest_domain_part': longest_part,
287
+ 'avg_domain_part_len': avg_part_len,
288
+ # Threshold flags
289
+ 'longest_part_gt_20': 1 if longest_part > 20 else 0,
290
+ 'longest_part_gt_30': 1 if longest_part > 30 else 0,
291
+ 'longest_part_gt_40': 1 if longest_part > 40 else 0,
292
+ # TLD checks
293
+ 'has_suspicious_tld': 1 if tld in self.suspicious_tlds else 0,
294
+ 'has_trusted_tld': 1 if tld in self.trusted_tlds else 0,
295
+ # Port
296
+ 'has_port': 1 if ':' in parsed.netloc else 0,
297
+ 'has_non_std_port': 1 if ':' in parsed.netloc and not parsed.netloc.endswith((':80', ':443')) else 0,
298
+ # Randomness
299
+ 'domain_randomness_score': self._calculate_domain_randomness(domain),
300
+ 'sld_consonant_cluster_score': self._consonant_clustering_score(sld),
301
+ 'sld_keyboard_pattern': self._keyboard_pattern_score(sld),
302
+ 'sld_has_dictionary_word': 1 if self._contains_dictionary_word(sld) else 0,
303
+ 'sld_pronounceability_score': self._pronounceability_score(sld),
304
+ 'domain_digit_position_suspicious': 1 if self._suspicious_digit_position(sld) else 0,
305
+ }
306
+
307
+ def _path_features(self, path: str, domain: str) -> dict:
308
+ """Path structure features."""
309
+ if not path or path == '/':
310
+ return {
311
+ 'path_depth': 0,
312
+ 'max_path_segment_len': 0,
313
+ 'avg_path_segment_len': 0.0,
314
+ 'has_extension': 0,
315
+ 'extension_category': 0,
316
+ 'has_suspicious_extension': 0,
317
+ 'has_exe': 0,
318
+ 'has_double_slash': 0,
319
+ 'path_has_brand_not_domain': 0,
320
+ 'path_has_ip_pattern': 0,
321
+ 'suspicious_path_extension_combo': 0,
322
+ }
323
+
324
+ segments = [s for s in path.split('/') if s]
325
+ depth = len(segments)
326
+
327
+ # Extension
328
+ has_ext = '.' in segments[-1] if segments else False
329
+ ext = segments[-1].split('.')[-1].lower() if has_ext else ''
330
+
331
+ # Check for suspicious extensions
332
+ exec_exts = {'exe', 'bat', 'cmd', 'scr', 'vbs', 'ps1'}
333
+ doc_exts = {'pdf', 'doc', 'docx', 'xls', 'xlsx'}
334
+
335
+ # Brand in path but not domain
336
+ path_brands = sum(1 for b in self.brand_names if b in path.lower())
337
+ domain_brands = sum(1 for b in self.brand_names if b in domain.lower())
338
+
339
+ return {
340
+ 'path_depth': depth,
341
+ 'max_path_segment_len': max((len(s) for s in segments), default=0),
342
+ 'avg_path_segment_len': sum(len(s) for s in segments) / depth if depth > 0 else 0,
343
+ 'has_extension': 1 if has_ext else 0,
344
+ 'extension_category': self._categorize_extension(ext),
345
+ 'has_suspicious_extension': 1 if ext in exec_exts else 0,
346
+ 'has_exe': 1 if ext == 'exe' else 0,
347
+ 'has_double_slash': 1 if '//' in path else 0,
348
+ 'path_has_brand_not_domain': 1 if path_brands > 0 and domain_brands == 0 else 0,
349
+ 'path_has_ip_pattern': 1 if re.search(r'\\d{1,3}[._-]\\d{1,3}[._-]\\d{1,3}[._-]\\d{1,3}', path) else 0,
350
+ 'suspicious_path_extension_combo': 1 if (ext in doc_exts and 'download' in path.lower()) else 0,
351
+ }
352
+
353
+ def _query_features(self, query: str) -> dict:
354
+ """Query string features."""
355
+ if not query:
356
+ return {
357
+ 'num_params': 0,
358
+ 'has_query': 0,
359
+ 'query_value_length': 0,
360
+ 'max_param_len': 0,
361
+ 'query_has_url': 0,
362
+ }
363
+
364
+ params = query.split('&')
365
+ param_values = [p.split('=')[1] if '=' in p else '' for p in params]
366
+
367
+ return {
368
+ 'num_params': len(params),
369
+ 'has_query': 1,
370
+ 'query_value_length': sum(len(v) for v in param_values),
371
+ 'max_param_len': max((len(p) for p in params), default=0),
372
+ 'query_has_url': 1 if any(v.startswith(('http', 'www')) for v in param_values) else 0,
373
+ }
374
+
375
+ def _statistical_features(self, url: str, domain: str, path: str) -> dict:
376
+ """Statistical features (entropy, patterns)."""
377
+ return {
378
+ 'url_entropy': self._entropy(url),
379
+ 'domain_entropy': self._entropy(domain),
380
+ 'path_entropy': self._entropy(path) if path else 0,
381
+ 'max_consecutive_digits': self._max_consecutive(url, str.isdigit),
382
+ 'max_consecutive_chars': self._max_consecutive(url, str.isalpha),
383
+ 'max_consecutive_consonants': self._max_consecutive_consonants(domain),
384
+ 'char_repeat_rate': self._repeat_rate(url),
385
+ 'unique_bigram_ratio': self._unique_ngram_ratio(url, 2),
386
+ 'unique_trigram_ratio': self._unique_ngram_ratio(url, 3),
387
+ 'sld_letter_diversity': self._character_diversity(domain.split('.')[-2] if '.' in domain else domain),
388
+ 'domain_has_numbers_letters': 1 if any(c.isdigit() for c in domain) and any(c.isalpha() for c in domain) else 0,
389
+ 'url_complexity_score': self._calculate_url_complexity(url),
390
+ }
391
+
392
+ def _security_features(self, url: str, parsed, domain: str, is_http: bool) -> dict:
393
+ """Security indicator features."""
394
+ return {
395
+ 'has_ip_address': 1 if self._is_ip(domain) else 0,
396
+ 'has_at_symbol': 1 if '@' in url else 0,
397
+ 'has_redirect': 1 if '//' in parsed.path else 0,
398
+
399
+ # CRITICAL FEATURES (from your top 20)
400
+ 'is_shortened': self._is_url_shortener(domain),
401
+ 'is_free_hosting': self._is_free_hosting(domain),
402
+ 'is_free_platform': self._is_free_platform(domain),
403
+ 'platform_subdomain_length': self._get_platform_subdomain_length(domain),
404
+ 'has_uuid_subdomain': self._detect_uuid_pattern(domain),
405
+
406
+ # HTTP vs HTTPS (from ORIGINAL URL)
407
+ 'is_http': 1 if is_http else 0,
408
+ }
409
+
410
    def _keyword_features(self, url: str, domain: str, path: str) -> dict:
        """Keyword and brand detection features.

        Scans the lowercased URL, domain and path for known phishing keywords
        and brand names, and computes several brand-spoofing indicators.
        """
        url_lower = url.lower()
        domain_lower = domain.lower()
        path_lower = path.lower()

        # Phishing keywords
        phishing_count = sum(1 for k in self.phishing_keywords if k in url_lower)

        # Brand mentions
        brands_in_url = [b for b in self.brand_names if b in url_lower]

        return {
            'num_phishing_keywords': phishing_count,
            'phishing_in_domain': 1 if any(k in domain_lower for k in self.phishing_keywords) else 0,
            'phishing_in_path': 1 if any(k in path_lower for k in self.phishing_keywords) else 0,
            'num_brands': len(brands_in_url),
            'brand_in_domain': 1 if any(b in domain_lower for b in self.brand_names) else 0,
            'brand_in_path': 1 if any(b in path_lower for b in self.brand_names) else 0,
            'brand_impersonation': self._brand_impersonation_score(domain, brands_in_url),
            # Specific phishing keywords (substring match on the whole URL)
            'has_login': 1 if 'login' in url_lower else 0,
            'has_account': 1 if 'account' in url_lower else 0,
            'has_verify': 1 if 'verify' in url_lower else 0,
            'has_secure': 1 if 'secure' in url_lower else 0,
            'has_update': 1 if 'update' in url_lower else 0,
            'has_bank': 1 if 'bank' in url_lower else 0,
            'has_password': 1 if 'password' in url_lower or 'passwd' in url_lower else 0,
            'has_suspend': 1 if 'suspend' in url_lower else 0,
            'has_webscr': 1 if 'webscr' in url_lower else 0,
            'has_cmd': 1 if 'cmd=' in url_lower or '/cmd/' in url_lower else 0,
            'has_cgi': 1 if 'cgi-bin' in url_lower or '.cgi' in url_lower else 0,
            # Brand spoofing patterns
            'brand_in_subdomain_not_domain': self._brand_subdomain_spoofing(domain, brands_in_url),
            'multiple_brands_in_url': 1 if len(brands_in_url) > 1 else 0,
            'brand_with_hyphen': self._brand_with_hyphen(domain),
            'suspicious_brand_tld': self._suspicious_brand_tld(domain),
            'brand_keyword_combo': self._brand_phishing_keyword_combo(url),
        }
449
+
450
    def _encoding_features(self, url: str, domain: str) -> dict:
        """Encoding and obfuscation features (percent-encoding, punycode, homographs)."""
        return {
            'has_url_encoding': 1 if '%' in url else 0,
            'encoding_count': url.count('%'),
            # How many characters the URL shrinks by when percent-decoded.
            'encoding_diff': len(url) - len(unquote(url)),
            'has_punycode': 1 if 'xn--' in domain else 0,
            'has_unicode': 1 if any(ord(c) > 127 for c in url) else 0,
            'has_hex_string': 1 if re.search(r'0x[0-9a-f]{6,}', url.lower()) else 0,
            # NOTE: loose heuristic — any run of 20+ base64-alphabet chars matches.
            'has_base64': 1 if re.search(r'[A-Za-z0-9+/]{20,}={0,2}', url) else 0,
            'has_lookalike_chars': self._detect_lookalike_chars(domain),
            'mixed_script_score': self._mixed_script_detection(domain),
            'homograph_brand_risk': self._homograph_brand_check(domain),
            'suspected_idn_homograph': 1 if self._idn_homograph_score(url) > 0.5 else 0,
            'double_encoding': 1 if self._detect_double_encoding(url) else 0,
            'encoding_in_domain': 1 if '%' in domain else 0,
            'suspicious_unicode_category': self._suspicious_unicode_chars(url),
        }
468
+
469
+ # ============================================================
470
+ # HELPER METHODS
471
+ # ============================================================
472
+
473
+ def _entropy(self, text: str) -> float:
474
+ """Calculate Shannon entropy."""
475
+ if not text:
476
+ return 0.0
477
+ counts = Counter(text)
478
+ probs = [count / len(text) for count in counts.values()]
479
+ return -sum(p * math.log2(p) for p in probs)
480
+
481
+ def _max_consecutive(self, text: str, condition) -> int:
482
+ """Max consecutive characters matching condition."""
483
+ if not text:
484
+ return 0
485
+ max_count = current = 0
486
+ for char in text:
487
+ if condition(char):
488
+ current += 1
489
+ max_count = max(max_count, current)
490
+ else:
491
+ current = 0
492
+ return max_count
493
+
494
+ def _max_consecutive_consonants(self, text: str) -> int:
495
+ """Max consecutive consonants."""
496
+ vowels = set('aeiou')
497
+ max_count = current = 0
498
+ for char in text.lower():
499
+ if char.isalpha() and char not in vowels:
500
+ current += 1
501
+ max_count = max(max_count, current)
502
+ else:
503
+ current = 0
504
+ return max_count
505
+
506
+ def _repeat_rate(self, text: str) -> float:
507
+ """Character repetition rate."""
508
+ if len(text) < 2:
509
+ return 0.0
510
+ repeats = sum(1 for i in range(len(text) - 1) if text[i] == text[i + 1])
511
+ return repeats / (len(text) - 1)
512
+
513
+ def _unique_ngram_ratio(self, text: str, n: int) -> float:
514
+ """Unique n-gram ratio."""
515
+ if len(text) < n:
516
+ return 1.0
517
+ ngrams = [text[i:i+n] for i in range(len(text) - n + 1)]
518
+ return len(set(ngrams)) / len(ngrams) if ngrams else 1.0
519
+
520
+ def _is_ip(self, domain: str) -> bool:
521
+ """Check if domain is an IP address."""
522
+ # IPv4
523
+ ipv4_pattern = r'^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}$'
524
+ if re.match(ipv4_pattern, domain):
525
+ return True
526
+ # IPv6 (simplified)
527
+ if ':' in domain and domain.count(':') >= 2:
528
+ return True
529
+ return False
530
+
531
+ def _is_url_shortener(self, domain: str) -> int:
532
+ """Check if domain is a URL shortener (EXACT match only)."""
533
+ return 1 if domain in self.shorteners else 0
534
+
535
+ def _is_free_hosting(self, domain: str) -> int:
536
+ """Check if domain uses FREE HOSTING service."""
537
+ if domain in self.free_hosting:
538
+ return 1
539
+ for host in self.free_hosting:
540
+ if domain.endswith('.' + host):
541
+ return 1
542
+ return 0
543
+
544
+ def _is_free_platform(self, domain: str) -> int:
545
+ """Check if domain uses FREE PLATFORM (distinct from free hosting!)."""
546
+ if domain in self.free_platforms:
547
+ return 1
548
+ for platform in self.free_platforms:
549
+ if domain.endswith('.' + platform):
550
+ return 1
551
+ return 0
552
+
553
+ def _get_platform_subdomain_length(self, domain: str) -> int:
554
+ """Get subdomain length for free platforms."""
555
+ for platform in self.free_platforms:
556
+ if domain.endswith('.' + platform) or domain == platform:
557
+ if '.' in domain:
558
+ subdomain = domain.split('.')[0]
559
+ return len(subdomain)
560
+ return 0
561
+
562
+ def _detect_uuid_pattern(self, domain: str) -> int:
563
+ """Detect UUID-like subdomains (Replit, Firebase patterns)."""
564
+ parts = domain.split('.')
565
+ if len(parts) >= 2:
566
+ subdomain = parts[0]
567
+ # UUID-like: long alphanumeric with hyphens
568
+ if len(subdomain) >= 20 and '-' in subdomain:
569
+ return 1
570
+ return 0
571
+
572
    def _calculate_domain_randomness(self, domain: str) -> float:
        """Heuristic randomness score for the second-level domain label.

        Combines normalized entropy, consonant clustering and the absence of
        dictionary words into a weighted sum.
        """
        if not domain:
            return 0.0

        # Second-level label ('example' in 'www.example.com'); whole string when no dot.
        sld = domain.split('.')[-2] if domain.count('.') >= 1 else domain

        # Factors: entropy, consonant clusters, no dictionary words
        entropy_score = min(1.0, self._entropy(sld) / 4.5)
        consonant_score = min(1.0, self._consonant_clustering_score(sld) / 3.0)
        # NOTE(review): no_dict_word is set to 0.3 here and then multiplied by
        # the 0.2 weight below, so its maximum contribution is only 0.06 and
        # the three weights do not sum to 1 — confirm whether the 0.3 value or
        # the 0.2 weight is the intended scaling.
        no_dict_word = 0.3 if not self._contains_dictionary_word(sld) else 0.0

        return (entropy_score * 0.5 + consonant_score * 0.3 + no_dict_word * 0.2)
585
+
586
    def _consonant_clustering_score(self, text: str) -> float:
        """Consonant clustering score in [0, 3]: longest consonant run / 2, capped."""
        max_consonants = self._max_consecutive_consonants(text)
        return min(3.0, max_consonants / 2.0)
590
+
591
+ def _keyboard_pattern_score(self, text: str) -> int:
592
+ """Check for keyboard patterns."""
593
+ text_lower = text.lower()
594
+ for pattern in self.keyboard_patterns:
595
+ if pattern in text_lower:
596
+ return 1
597
+ return 0
598
+
599
+ def _contains_dictionary_word(self, text: str) -> int:
600
+ """Check if text contains common English word."""
601
+ text_lower = text.lower()
602
+ for word in self.common_words:
603
+ if len(word) >= 4 and word in text_lower:
604
+ return 1
605
+ return 0
606
+
607
+ def _pronounceability_score(self, text: str) -> float:
608
+ """Pronounceability score based on vowel/consonant alternation."""
609
+ if len(text) < 3:
610
+ return 0.5
611
+
612
+ vowels = set('aeiou')
613
+ alternations = 0
614
+
615
+ for i in range(len(text) - 1):
616
+ c1, c2 = text[i].lower(), text[i + 1].lower()
617
+ if c1.isalpha() and c2.isalpha():
618
+ if (c1 in vowels) != (c2 in vowels):
619
+ alternations += 1
620
+
621
+ return min(1.0, alternations / (len(text) - 1))
622
+
623
+ def _suspicious_digit_position(self, text: str) -> int:
624
+ """Check for suspicious digit positions (digits at start/end)."""
625
+ if not text:
626
+ return 0
627
+ if text[0].isdigit() or text[-1].isdigit():
628
+ return 1
629
+ return 0
630
+
631
+ def _brand_impersonation_score(self, domain: str, brands_in_url: list) -> int:
632
+ """Check if brand appears in suspicious way."""
633
+ if not brands_in_url:
634
+ return 0
635
+
636
+ domain_lower = domain.lower()
637
+ for brand in brands_in_url:
638
+ # Brand in subdomain or with separator
639
+ if f"{brand}-" in domain_lower or f"{brand}." in domain_lower:
640
+ # But not as main domain
641
+ if not domain_lower.endswith(f".{brand}.com"):
642
+ return 1
643
+ return 0
644
+
645
+ def _brand_subdomain_spoofing(self, domain: str, brands: list) -> int:
646
+ """Brand in subdomain but not main domain."""
647
+ if not brands:
648
+ return 0
649
+
650
+ parts = domain.split('.')
651
+ if len(parts) >= 3:
652
+ subdomain = '.'.join(parts[:-2])
653
+ main_domain = '.'.join(parts[-2:])
654
+
655
+ for brand in self.brand_names:
656
+ if brand in subdomain.lower() and brand not in main_domain.lower():
657
+ return 1
658
+ return 0
659
+
660
+ def _brand_with_hyphen(self, domain: str) -> int:
661
+ """Brand name with hyphen (spoofing technique)."""
662
+ domain_lower = domain.lower()
663
+ for brand in self.brand_names:
664
+ if f"{brand}-" in domain_lower or f"-{brand}" in domain_lower:
665
+ return 1
666
+ return 0
667
+
668
+ def _suspicious_brand_tld(self, domain: str) -> int:
669
+ """Brand in domain with suspicious TLD."""
670
+ parts = domain.split('.')
671
+ if len(parts) >= 2:
672
+ sld = parts[-2].lower()
673
+ tld = parts[-1].lower()
674
+
675
+ if sld in self.brand_names and tld in self.suspicious_tlds:
676
+ return 1
677
+ return 0
678
+
679
+ def _brand_phishing_keyword_combo(self, url: str) -> int:
680
+ """Brand + phishing keyword combination."""
681
+ url_lower = url.lower()
682
+ has_brand = any(b in url_lower for b in self.brand_names)
683
+ has_phishing = any(k in url_lower for k in self.phishing_keywords)
684
+ return 1 if has_brand and has_phishing else 0
685
+
686
+ def _categorize_extension(self, ext: str) -> int:
687
+ """Categorize file extension (0=none, 1=doc, 2=media, 3=exec, 4=web, 5=other)."""
688
+ if not ext:
689
+ return 0
690
+
691
+ doc_exts = {'pdf', 'doc', 'docx', 'xls', 'xlsx', 'txt'}
692
+ media_exts = {'jpg', 'jpeg', 'png', 'gif', 'mp4', 'mp3'}
693
+ exec_exts = {'exe', 'bat', 'cmd', 'scr', 'vbs', 'ps1'}
694
+ web_exts = {'html', 'htm', 'php', 'asp', 'jsp'}
695
+
696
+ if ext in doc_exts:
697
+ return 1
698
+ elif ext in media_exts:
699
+ return 2
700
+ elif ext in exec_exts:
701
+ return 3
702
+ elif ext in web_exts:
703
+ return 4
704
+ return 5
705
+
706
+ def _character_diversity(self, text: str) -> float:
707
+ """Character diversity (unique chars / total chars)."""
708
+ if not text:
709
+ return 0.0
710
+ return len(set(text)) / len(text)
711
+
712
+ def _calculate_url_complexity(self, url: str) -> float:
713
+ """Overall URL complexity score."""
714
+ complexity = 0.0
715
+ complexity += min(1.0, len(url) / 100) # Length factor
716
+ complexity += min(1.0, url.count('.') / 5) # Dots
717
+ complexity += min(1.0, url.count('-') / 3) # Hyphens
718
+ complexity += min(1.0, url.count('/') / 5) # Paths
719
+ complexity += min(1.0, self._entropy(url) / 5) # Entropy
720
+ return complexity / 5
721
+
722
+ def _detect_lookalike_chars(self, domain: str) -> int:
723
+ """Detect lookalike character substitutions."""
724
+ suspicious_patterns = ['rn', 'vv', 'cl', '0', '1']
725
+ domain_lower = domain.lower()
726
+ for pattern in suspicious_patterns:
727
+ if pattern in domain_lower:
728
+ return 1
729
+ return 0
730
+
731
+ def _mixed_script_detection(self, domain: str) -> int:
732
+ """Detect mixed scripts (Cyrillic + Latin)."""
733
+ latin = sum(1 for c in domain if ord(c) < 128 and c.isalpha())
734
+ non_latin = sum(1 for c in domain if ord(c) >= 128 and c.isalpha())
735
+
736
+ if latin > 0 and non_latin > 0:
737
+ return min(3, non_latin)
738
+ return 0
739
+
740
+ def _homograph_brand_check(self, domain: str) -> int:
741
+ """Check for homograph attack on brand names."""
742
+ # Simplified: check if domain looks like a brand but has non-ASCII
743
+ has_non_ascii = any(ord(c) > 127 for c in domain)
744
+ looks_like_brand = any(brand in domain.lower() for brand in self.brand_names[:10])
745
+
746
+ return 1 if has_non_ascii and looks_like_brand else 0
747
+
748
+ def _idn_homograph_score(self, url: str) -> float:
749
+ """Calculate IDN homograph attack score."""
750
+ if 'xn--' not in url:
751
+ return 0.0
752
+
753
+ # Punycode detected - potential IDN homograph
754
+ non_ascii_count = sum(1 for c in url if ord(c) > 127)
755
+ return min(1.0, non_ascii_count / 10)
756
+
757
+ def _detect_double_encoding(self, url: str) -> int:
758
+ """Detect double URL encoding (%%25)."""
759
+ if '%%' in url or '%25' in url:
760
+ return 1
761
+ return 0
762
+
763
+ def _suspicious_unicode_chars(self, url: str) -> int:
764
+ """Count suspicious Unicode characters."""
765
+ # Check for right-to-left override, zero-width, etc.
766
+ suspicious = sum(1 for c in url if ord(c) in [0x202E, 0x200B, 0x200C, 0x200D, 0xFEFF])
767
+ return min(5, suspicious)
768
+
769
    def _get_default_features(self) -> dict:
        """Return an all-zero feature dict, used as a fallback for invalid URLs."""
        feature_names = self.get_feature_names()
        return {name: 0 for name in feature_names}
773
+
774
    def get_feature_names(self) -> list:
        """Return all feature names in extraction order.

        Derived by running a full extraction on a dummy URL, so the order is
        always consistent with whatever extract_features() currently emits.
        """
        dummy_features = self.extract_features("https://example.com/test")
        return list(dummy_features.keys())
778
+
779
    def extract_batch(self, urls: list, show_progress: bool = True) -> pd.DataFrame:
        """Extract features for every URL in *urls* (one DataFrame row per URL).

        Args:
            urls: list of URL strings.
            show_progress: when True, wrap the loop in a tqdm progress bar.
        """
        if show_progress:
            # Local import: tqdm is only needed when a progress bar is requested.
            from tqdm import tqdm
            features = [self.extract_features(url) for url in tqdm(urls, desc="Extracting")]
        else:
            features = [self.extract_features(url) for url in urls]

        return pd.DataFrame(features)
788
+
789
+
790
def main():
    """Extract URL features from the balanced dataset and save them as CSV.

    Command-line flags:
        --sample N   process only a seeded random sample of N rows
        --output F   output CSV name inside data/features/ (default url_features_790k.csv)

    Side effects: reads data/processed/url_dataset_balanced.csv, writes the
    feature CSV, and logs a small URL-normalization self-test.

    Fix: removed the unused local `import sys`.
    """
    parser = argparse.ArgumentParser(description='URL Feature Extraction v2.2 OPTIMIZED')
    parser.add_argument('--sample', type=int, default=None, help='Sample N URLs')
    parser.add_argument('--output', type=str, default=None, help='Output filename')
    args = parser.parse_args()

    logger.info("=" * 70)
    logger.info("URL Feature Extraction v3 - OPTIMIZED & NORMALIZED")
    logger.info("=" * 70)
    logger.info("")
    logger.info("KEY IMPROVEMENTS:")
    logger.info(" ✅ URL Normalization (www/non-www consistent)")
    logger.info(" ✅ Scheme normalization (http/https handling)")
    logger.info(" ✅ Separated is_free_hosting from is_free_platform")
    logger.info(" ✅ Removed redundant www features")
    logger.info(" ✅ Optimized for production")
    logger.info("")

    # Load dataset (path is resolved relative to this script's location)
    script_dir = Path(__file__).parent
    data_file = (script_dir / '../../../data/processed/url_dataset_balanced.csv').resolve()

    logger.info(f"Loading: {data_file.name}")
    df = pd.read_csv(data_file)
    logger.info(f"Loaded: {len(df):,} URLs")

    if args.sample and args.sample < len(df):
        # Fixed seed keeps sampled runs reproducible.
        df = df.sample(n=args.sample, random_state=42)
        logger.info(f"Sampled: {len(df):,} URLs")

    # Extract features
    extractor = URLFeatureExtractorOptimized()
    features_df = extractor.extract_batch(df['url'].tolist())
    features_df['label'] = df['label'].values

    # Save
    output_dir = (script_dir / '../../../data/features').resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    if args.output:
        output_file = output_dir / args.output
    else:
        output_file = output_dir / 'url_features_790k.csv'

    features_df.to_csv(output_file, index=False)

    logger.info("")
    logger.info("=" * 70)
    logger.info(f"✓ Saved: {output_file}")
    logger.info(f" Shape: {features_df.shape}")
    logger.info(f" Features: {len(features_df.columns) - 1}")
    logger.info("=" * 70)

    # Smoke-test normalization: every variant should map to the same domain.
    logger.info("")
    logger.info("NORMALIZATION TEST:")
    test_urls = [
        "https://github.com/user/repo",
        "http://www.github.com/user/repo",
        "www.github.com/user/repo",
        "github.com/user/repo"
    ]

    for url in test_urls:
        norm_url, orig, norm_domain, is_http = extractor.normalize_url(url)
        logger.info(f" {url}")
        logger.info(f" → {norm_domain} (http={is_http})")

    logger.info("")
    logger.info("All normalized URLs should have identical domain: 'github.com'")


if __name__ == "__main__":
    main()
scripts/phishing_analysis/analysis.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
from urllib.parse import urlparse
import re

# Load phishing URLs (expects a CSV with one row per phishing URL)
phish_df = pd.read_csv('phishing_urls.csv')

print("=== PHISHING DATASET INFO ===")
print(f"Total phishing URLs: {len(phish_df)}")
print(f"Columns: {phish_df.columns.tolist()}\n")

# Assume URL column is 'url' (adjust if different)
url_column = 'url'  # Change to your actual column name

print("=== PHISHING TYPE ANALYSIS (from raw URLs) ===\n")
16
+
17
def analyze_phishing_type(url):
    """Classify a phishing URL into a coarse type from its raw text only.

    Returns a dict with keys 'url', 'domain', 'path', 'type' (plus 'brand' or
    'keyword_count' for some types). Types, checked in priority order:
    ip_based, brand_impersonation, generic_phishing, suspicious_tld,
    compromised_site, other.

    Fixes vs. the original:
      * TLD checks now use endswith() instead of substring matching, which
        also fired on hosts such as 'x.tkx.com' or 'evil.community.xyz'.
      * Removed the dead local `brand_found`.
    """
    url = str(url).lower()
    parsed = urlparse(url)
    domain = parsed.netloc
    path = parsed.path

    result = {
        'url': url,
        'domain': domain,
        'path': path,
        'type': 'unknown'
    }

    # 1. IP-based phishing: dotted quad anywhere in the host
    ip_pattern = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
    if re.search(ip_pattern, domain):
        result['type'] = 'ip_based'
        return result

    # 2. Brand impersonation (brand mentioned outside its own registrable domain)
    brands = [
        'paypal', 'amazon', 'apple', 'google', 'microsoft', 'facebook',
        'netflix', 'ebay', 'instagram', 'twitter', 'linkedin', 'bank',
        'chase', 'wellsfargo', 'citi', 'americanexpress', 'visa', 'mastercard',
        'dhl', 'fedex', 'ups', 'usps', 'alibaba', 'walmart', 'adobe',
        'dropbox', 'office365', 'outlook', 'yahoo', 'aol', 'whatsapp'
    ]

    url_full = domain + path
    # Second-level label ('example' in 'www.example.com')
    sld = domain.split('.')[-2] if len(domain.split('.')) >= 2 else domain

    for brand in brands:
        if brand in url_full:
            if brand == sld or sld.startswith(brand):
                # Brand is (or prefixes) the registrable domain: legitimate usage,
                # stop scanning further brands (matches original behavior).
                break
            result['type'] = 'brand_impersonation'
            result['brand'] = brand
            return result

    # 3. Generic phishing: two or more phishing keywords anywhere in the URL
    phishing_keywords = [
        'login', 'signin', 'verify', 'account', 'update', 'secure',
        'confirm', 'suspended', 'locked', 'alert', 'urgent', 'validate',
        'banking', 'credential', 'auth', 'password', 'restore', 'recover'
    ]

    keyword_count = sum(1 for kw in phishing_keywords if kw in url_full)
    if keyword_count >= 2:
        result['type'] = 'generic_phishing'
        result['keyword_count'] = keyword_count
        return result

    # 4. Suspicious TLD (endswith, so only the real TLD matches)
    suspicious_tlds = ['.tk', '.ml', '.ga', '.cf', '.gq', '.xyz', '.top', '.work', '.click']
    if domain.endswith(tuple(suspicious_tlds)):
        result['type'] = 'suspicious_tld'
        return result

    # 5. Compromised site: trusted TLD but a phishing keyword in the path
    trusted_tlds = ['.com', '.org', '.net', '.edu', '.gov']
    if domain.endswith(tuple(trusted_tlds)):
        if any(kw in path for kw in phishing_keywords):
            result['type'] = 'compromised_site'
            return result

    # Default
    result['type'] = 'other'
    return result
94
+
95
# Analyze all URLs: classify each row and collect per-URL results.
print("Analyzing URLs... (this may take a minute)")
results = []
for url in phish_df[url_column]:
    results.append(analyze_phishing_type(url))

results_df = pd.DataFrame(results)

# Count types
type_counts = results_df['type'].value_counts()

print("\n=== PHISHING TYPE DISTRIBUTION ===")
for ptype, count in type_counts.items():
    percentage = (count / len(phish_df)) * 100
    print(f"{ptype:20s}: {count:6d} / {len(phish_df)} ({percentage:5.1f}%)")

# Domain characteristics
print("\n=== DOMAIN CHARACTERISTICS ===")

# Domain lengths
domain_lengths = results_df['domain'].apply(len)
print(f"Avg domain length: {domain_lengths.mean():.1f} chars")
print(f"Median domain length: {domain_lengths.median():.1f} chars")

# Number of dot-separated domain parts (labels)
num_parts = results_df['domain'].apply(lambda d: len(d.split('.')))
print(f"Avg domain parts: {num_parts.mean():.1f}")
print(f"Median domain parts: {num_parts.median():.1f}")

# Number of subdomains
num_subdomains = num_parts - 2  # Subtract SLD and TLD
print(f"Avg subdomains: {num_subdomains.mean():.1f}")

# Path characteristics
print("\n=== PATH CHARACTERISTICS ===")
path_lengths = results_df['path'].apply(len)
print(f"Avg path length: {path_lengths.mean():.1f} chars")
# A path of length > 1 means more than the bare '/'.
print(f"URLs with paths: {(path_lengths > 1).sum()} / {len(phish_df)} ({(path_lengths > 1).sum()/len(phish_df)*100:.1f}%)")

# Show up to three example URLs for each of the five most common types.
print("\n=== EXAMPLES BY TYPE ===")
for ptype in type_counts.index[:5]:
    examples = results_df[results_df['type'] == ptype]['url'].head(3)
    print(f"\n{ptype.upper()}:")
    for i, ex in enumerate(examples, 1):
        print(f" {i}. {ex[:100]}...")

# Save detailed results
results_df.to_csv('phishing_type_analysis.csv', index=False)
print("\n✅ Detailed results saved to: phishing_type_analysis.csv")
scripts/phishing_analysis/phishing_analysis.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
from urllib.parse import urlparse
from collections import Counter
import re

# Load detailed results
# NOTE(review): this reads the raw de-duplicated dataset, not the
# phishing_type_analysis.csv produced by analysis.py, yet the code below
# requires 'type', 'domain' and 'url' columns — confirm the intended input.
results_df = pd.read_csv('data/raw/clean_dataset_no_duplicates.csv')

print("=== DETAILED 'OTHER' CATEGORY ANALYSIS ===\n")

# Filter only 'other' type
other_df = results_df[results_df['type'] == 'other']

# 1. TLD distribution
print("TOP 20 TLDs in 'OTHER' category:")
tlds = other_df['domain'].apply(lambda d: '.' + d.split('.')[-1] if '.' in d else '')
tld_counts = Counter(tlds)
for tld, count in tld_counts.most_common(20):
    pct = (count / len(other_df)) * 100
    print(f" {tld:10s}: {count:5d} ({pct:4.1f}%)")

# 2. Domain length distribution (quartiles)
print("\n=== DOMAIN LENGTH DISTRIBUTION (OTHER) ===")
lengths = other_df['domain'].str.len()
print(f"Min: {lengths.min()}")
print(f"25%: {lengths.quantile(0.25):.0f}")
print(f"50%: {lengths.median():.0f}")
print(f"75%: {lengths.quantile(0.75):.0f}")
print(f"Max: {lengths.max()}")

# 3. Check for non-English brands/keywords
print("\n=== POTENTIAL NON-ENGLISH BRANDS/KEYWORDS ===")

# Common patterns in 'other'
# NOTE(review): all_domains is built here but never used afterwards.
all_domains = ' '.join(other_df['domain'].tolist()).lower()

# Find common substrings
from collections import defaultdict
substring_counts = defaultdict(int)

for domain in other_df['domain']:
    domain = domain.lower()
    # Extract words (split by dots, hyphens, underscores)
    parts = re.split(r'[\.\-_]', domain)
    for part in parts:
        if len(part) >= 5:  # Min 5 chars
            substring_counts[part] += 1

# Top recurring words
print("Top 30 recurring words in domains:")
for word, count in sorted(substring_counts.items(), key=lambda x: x[1], reverse=True)[:30]:
    if count >= 10:  # Appears at least 10 times
        print(f" {word:30s}: {count:4d} occurrences")

# 4. Digit patterns
print("\n=== DIGIT PATTERNS ===")
has_digits = other_df['domain'].str.contains(r'\d')
print(f"Domains with digits: {has_digits.sum()} / {len(other_df)} ({has_digits.sum()/len(other_df)*100:.1f}%)")

# 5. Length of longest dot-separated label per domain
print("\n=== LONGEST DOMAIN PART ===")
longest_parts = other_df['domain'].apply(lambda d: max(d.split('.'), key=len))
longest_part_lens = longest_parts.str.len()
print(f"Avg longest part: {longest_part_lens.mean():.1f} chars")
print(f"Median longest part: {longest_part_lens.median():.1f} chars")

# Show some examples of long domains
print("\nExamples of domains with longest part > 30 chars:")
long_domains = other_df[longest_part_lens > 30]['url'].head(10)
for url in long_domains:
    print(f" {url[:100]}...")

# 6. Hyphen analysis
print("\n=== HYPHEN ANALYSIS ===")
hyphen_counts = other_df['domain'].str.count('-')
print(f"Avg hyphens per domain: {hyphen_counts.mean():.2f}")
print(f"Domains with 3+ hyphens: {(hyphen_counts >= 3).sum()} ({(hyphen_counts >= 3).sum()/len(other_df)*100:.1f}%)")

# 7. Subdomain analysis (labels minus SLD and TLD)
print("\n=== SUBDOMAIN ANALYSIS ===")
num_parts = other_df['domain'].str.count(r'\.') + 1
num_subdomains = num_parts - 2
print(f"Domains with 2+ subdomains: {(num_subdomains >= 2).sum()} ({(num_subdomains >= 2).sum()/len(other_df)*100:.1f}%)")

print("\n✅ Analysis complete!")
scripts/phishing_analysis/phishing_type_analysis.csv ADDED
The diff for this file is too large to render. See raw diff
 
scripts/predict_combined.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Combined URL+HTML Phishing Detector - Interactive Demo
3
+
4
+ Downloads HTML from URL, extracts both URL and HTML features,
5
+ and predicts using the combined model (XGBoost + Random Forest).
6
+
7
+ Usage:
8
+ python scripts/predict_combined.py
9
+ python scripts/predict_combined.py https://example.com
10
+ """
11
+ import sys
12
+ import logging
13
+ import warnings
14
+ from pathlib import Path
15
+
16
+ import joblib
17
+ import numpy as np
18
+ import pandas as pd
19
+ import requests
20
+ from colorama import init, Fore, Style
21
+
22
+ warnings.filterwarnings('ignore', message='.*Unverified HTTPS.*')
23
+ import urllib3
24
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
25
+
26
+ init(autoreset=True)
27
+
28
+ logging.basicConfig(
29
+ level=logging.INFO,
30
+ format='%(asctime)s - %(levelname)s - %(message)s',
31
+ datefmt='%H:%M:%S',
32
+ )
33
+ logger = logging.getLogger('predict_combined')
34
+
35
+ # Project imports
36
+ PROJECT_ROOT = Path(__file__).resolve().parents[1]
37
+ sys.path.insert(0, str(PROJECT_ROOT))
38
+
39
+ from scripts.feature_extraction.url.url_features_v3 import URLFeatureExtractorOptimized
40
+ from scripts.feature_extraction.html.html_feature_extractor import HTMLFeatureExtractor
41
+ from scripts.feature_extraction.html.feature_engineering import engineer_features
42
+
43
+
44
+ class CombinedPhishingDetector:
45
+ """Detect phishing using combined URL + HTML features."""
46
+
47
+ HEADERS = {
48
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
49
+ 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
50
+ }
51
+
52
    def __init__(self):
        """Build feature extractors and load trained combined models.

        Looks for the XGBoost and Random Forest combined models under
        <project>/saved_models; each is optional, but at least one must exist.

        Raises:
            FileNotFoundError: when no combined model file is found on disk.
        """
        models_dir = PROJECT_ROOT / 'saved_models'

        # Feature extractors
        self.url_extractor = URLFeatureExtractorOptimized()
        self.html_extractor = HTMLFeatureExtractor()

        # Load combined models (missing files are skipped by _load_model)
        self.models = {}
        self._load_model(models_dir, 'XGBoost Combined',
                         'xgboost_combined.joblib',
                         'xgboost_combined_feature_names.joblib')
        self._load_model(models_dir, 'Random Forest Combined',
                         'random_forest_combined.joblib',
                         'random_forest_combined_feature_names.joblib')

        if not self.models:
            raise FileNotFoundError(
                "No combined models found! Train first:\n"
                " python scripts/merge_url_html_features.py --balance\n"
                " python models/train_combined_models.py")
73
+
74
    def _load_model(self, models_dir: Path, name: str,
                    model_file: str, features_file: str):
        """Load one model (and its feature-name list, if present) into self.models.

        Silently skips the model when its .joblib file does not exist; the
        'features' entry is None when the feature-name file is missing.
        """
        model_path = models_dir / model_file
        feat_path = models_dir / features_file
        if model_path.exists():
            self.models[name] = {
                'model': joblib.load(model_path),
                'features': joblib.load(feat_path) if feat_path.exists() else None,
            }
            n = len(self.models[name]['features']) if self.models[name]['features'] else '?'
            logger.info(f"Loaded {name} ({n} features)")
85
+
86
+ def predict(self, url: str) -> dict:
87
+ """Download HTML, extract features, predict."""
88
+ # 1. Extract URL features
89
+ url_features = self.url_extractor.extract_features(url)
90
+ url_df = pd.DataFrame([url_features])
91
+ url_df = url_df.rename(columns={c: f'url_{c}' for c in url_df.columns})
92
+
93
+ # 2. Download + extract HTML features
94
+ html_features = None
95
+ html_error = None
96
+ try:
97
+ resp = requests.get(url, timeout=10, verify=False, headers=self.HEADERS)
98
+ raw_html_features = self.html_extractor.extract_features(resp.text)
99
+ raw_df = pd.DataFrame([raw_html_features])
100
+ eng_df = engineer_features(raw_df)
101
+ eng_df = eng_df.rename(columns={c: f'html_{c}' for c in eng_df.columns})
102
+ html_features = raw_html_features
103
+ except Exception as e:
104
+ html_error = str(e)
105
+ logger.warning(f"Could not download HTML: {e}")
106
+ # Create zero-filled HTML features
107
+ eng_df = pd.DataFrame()
108
+
109
+ # 3. Combine
110
+ combined_df = pd.concat([url_df, eng_df], axis=1)
111
+
112
+ # 4. Predict with each model
113
+ predictions = []
114
+ for name, data in self.models.items():
115
+ model = data['model']
116
+ expected = data['features']
117
+
118
+ if expected:
119
+ aligned = pd.DataFrame(columns=expected)
120
+ for f in expected:
121
+ aligned[f] = combined_df[f].values if f in combined_df.columns else 0
122
+ X = aligned.values
123
+ else:
124
+ X = combined_df.values
125
+
126
+ proba = model.predict_proba(X)[0]
127
+ pred = 1 if proba[1] > 0.5 else 0
128
+
129
+ predictions.append({
130
+ 'model_name': name,
131
+ 'prediction': 'PHISHING' if pred else 'LEGITIMATE',
132
+ 'confidence': float(proba[pred] * 100),
133
+ 'phishing_probability': float(proba[1] * 100),
134
+ 'legitimate_probability': float(proba[0] * 100),
135
+ })
136
+
137
+ # Consensus
138
+ phishing_votes = sum(1 for p in predictions if p['prediction'] == 'PHISHING')
139
+ total = len(predictions)
140
+ is_phishing = phishing_votes > total / 2
141
+
142
+ if phishing_votes == total:
143
+ consensus = "ALL MODELS AGREE: PHISHING"
144
+ elif phishing_votes == 0:
145
+ consensus = "ALL MODELS AGREE: LEGITIMATE"
146
+ else:
147
+ consensus = f"MIXED: {phishing_votes}/{total} models say PHISHING"
148
+
149
+ return {
150
+ 'url': url,
151
+ 'is_phishing': is_phishing,
152
+ 'consensus': consensus,
153
+ 'predictions': predictions,
154
+ 'url_features': url_features,
155
+ 'html_features': html_features,
156
+ 'html_error': html_error,
157
+ }
158
+
159
    def print_results(self, result: dict):
        """Pretty-print results.

        Expects the dict produced by ``predict()``: 'url', 'predictions',
        'is_phishing', 'consensus', 'url_features', 'html_features',
        'html_error'. Output only -- returns None.
        """
        print("\n" + "=" * 80)
        print(f"{Fore.CYAN}{Style.BRIGHT}COMBINED URL+HTML PHISHING DETECTION{Style.RESET_ALL}")
        print("=" * 80)
        print(f"\n{Fore.YELLOW}URL:{Style.RESET_ALL} {result['url']}")

        # Warn when the page could not be fetched (URL-features-only verdict).
        if result.get('html_error'):
            print(f"{Fore.RED}HTML download failed: {result['html_error']}{Style.RESET_ALL}")
            print(f"{Fore.YELLOW}Using URL features only (HTML features zeroed){Style.RESET_ALL}")

        # Model predictions
        print(f"\n{Fore.CYAN}{Style.BRIGHT}MODEL PREDICTIONS:{Style.RESET_ALL}")
        print("-" * 80)

        for pred in result['predictions']:
            is_safe = pred['prediction'] == 'LEGITIMATE'
            color = Fore.GREEN if is_safe else Fore.RED
            icon = "✓" if is_safe else "⚠"

            print(f"\n{Style.BRIGHT}{pred['model_name']}:{Style.RESET_ALL}")
            print(f" {icon} Prediction: {color}{Style.BRIGHT}{pred['prediction']}{Style.RESET_ALL}")
            print(f" Confidence: {pred['confidence']:.1f}%")
            print(f" Phishing: {Fore.RED}{pred['phishing_probability']:6.2f}%{Style.RESET_ALL}")
            print(f" Legitimate: {Fore.GREEN}{pred['legitimate_probability']:6.2f}%{Style.RESET_ALL}")

        # Consensus
        print(f"\n{Fore.CYAN}{Style.BRIGHT}CONSENSUS:{Style.RESET_ALL}")
        print("-" * 80)

        if result['is_phishing']:
            print(f"🚨 {Fore.RED}{Style.BRIGHT}{result['consensus']}{Style.RESET_ALL}")
        else:
            print(f"✅ {Fore.GREEN}{Style.BRIGHT}{result['consensus']}{Style.RESET_ALL}")

        # Key features (empty dicts when features were unavailable)
        url_feat = result.get('url_features', {})
        html_feat = result.get('html_features', {})

        print(f"\n{Fore.CYAN}{Style.BRIGHT}KEY URL FEATURES:{Style.RESET_ALL}")
        print("-" * 80)
        url_keys = [
            ('Domain Length', url_feat.get('domain_length', 0)),
            ('Num Subdomains', url_feat.get('num_subdomains', 0)),
            ('Domain Dots', url_feat.get('domain_dots', 0)),
            ('Is Shortened', 'Yes' if url_feat.get('is_shortened') else 'No'),
            ('Is Free Platform', 'Yes' if url_feat.get('is_free_platform') else 'No'),
            ('Is HTTP', 'Yes' if url_feat.get('is_http') else 'No'),
            ('Has @ Symbol', 'Yes' if url_feat.get('has_at_symbol') else 'No'),
        ]
        for name, val in url_keys:
            print(f" {name:25s}: {val}")

        # HTML section is skipped entirely when the download failed
        # (html_features is None in that case).
        if html_feat:
            print(f"\n{Fore.CYAN}{Style.BRIGHT}KEY HTML FEATURES:{Style.RESET_ALL}")
            print("-" * 80)
            html_keys = [
                ('Text Length', html_feat.get('text_length', 0)),
                ('Num Links', html_feat.get('num_links', 0)),
                ('Num Forms', html_feat.get('num_forms', 0)),
                ('Password Fields', html_feat.get('num_password_fields', 0)),
                ('Has Login Form', 'Yes' if html_feat.get('has_login_form') else 'No'),
                ('Has Meta Refresh', 'Yes' if html_feat.get('has_meta_refresh') else 'No'),
                ('Has atob()', 'Yes' if html_feat.get('has_atob') else 'No'),
                ('External Links', html_feat.get('num_external_links', 0)),
            ]
            for name, val in html_keys:
                print(f" {name:25s}: {val}")

        print("\n" + "=" * 80 + "\n")
229
+
230
+
231
def main():
    """CLI entry point: one-shot mode when a URL is given on argv, else a REPL."""

    def normalize(raw: str) -> str:
        # Default to https:// when the user omits the scheme.
        return raw if raw.startswith(('http://', 'https://')) else 'https://' + raw

    print(f"\n{Fore.CYAN}{Style.BRIGHT}")
    print("╔══════════════════════════════════════════════════════════════╗")
    print("║ COMBINED URL+HTML PHISHING DETECTOR ║")
    print("╚══════════════════════════════════════════════════════════════╝")
    print(f"{Style.RESET_ALL}")

    print(f"{Fore.YELLOW}Loading models...{Style.RESET_ALL}")
    detector = CombinedPhishingDetector()
    print(f"{Fore.GREEN}✓ Models loaded!{Style.RESET_ALL}\n")

    # One-shot mode: a single URL supplied on the command line.
    if len(sys.argv) > 1:
        detector.print_results(detector.predict(normalize(sys.argv[1])))
        return

    # Interactive REPL until the user asks to leave.
    while True:
        print(f"{Fore.CYAN}{'─' * 80}{Style.RESET_ALL}")
        entry = input(f"{Fore.YELLOW}Enter URL (or 'quit'):{Style.RESET_ALL} ").strip()

        if entry.lower() in ('quit', 'exit', 'q'):
            print(f"\n{Fore.GREEN}Goodbye!{Style.RESET_ALL}\n")
            break

        if not entry:
            continue

        try:
            detector.print_results(detector.predict(normalize(entry)))
        except Exception as e:
            print(f"\n{Fore.RED}Error: {e}{Style.RESET_ALL}\n")


if __name__ == '__main__':
    main()
scripts/predict_html.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
HTML Phishing Detection - Interactive Prediction
Predicts if HTML file/URL is phishing using trained model
"""
import sys
from pathlib import Path
import joblib
import pandas as pd
from colorama import init, Fore, Style
import requests

# Add project root to path so the package-style imports below resolve when
# this file is executed directly as a script.
sys.path.append(str(Path(__file__).parent.parent))

from scripts.feature_extraction.html.html_feature_extractor import HTMLFeatureExtractor
from scripts.feature_extraction.html.feature_engineering import engineer_features

# Initialize colorama (autoreset restores default colors after each print)
init(autoreset=True)
20
+
21
+
22
class HTMLPhishingPredictor:
    """Predict phishing from HTML content using trained models.

    Loads the Random Forest and/or XGBoost HTML models (whichever exist
    under ``saved_models``) with their saved feature-name lists, and offers
    prediction entry points for a local file, a live URL, or a raw HTML
    string. When both models are present an averaged ensemble verdict is
    also reported.
    """

    def __init__(self):
        """Initialize predictor with all trained models.

        Raises:
            FileNotFoundError: if neither model file exists.
        """
        # NOTE(review): this path is relative to the current working
        # directory, not the script location -- run from the project root.
        models_dir = Path('saved_models')

        # Load Random Forest model and its feature names
        rf_model_path = models_dir / 'random_forest_html.joblib'
        rf_features_path = models_dir / 'random_forest_html_feature_names.joblib'
        if rf_model_path.exists():
            print(f"Loading Random Forest model: {rf_model_path}")
            self.rf_model = joblib.load(rf_model_path)
            self.has_rf = True
            # Load RF feature names
            if rf_features_path.exists():
                self.rf_feature_names = joblib.load(rf_features_path)
                print(f"Loaded {len(self.rf_feature_names)} Random Forest feature names")
            else:
                self.rf_feature_names = None
        else:
            print(f"{Fore.YELLOW}Random Forest model not found{Style.RESET_ALL}")
            self.rf_model = None
            self.has_rf = False
            self.rf_feature_names = None

        # Load XGBoost model and its feature names
        xgb_model_path = models_dir / 'xgboost_html.joblib'
        xgb_features_path = models_dir / 'xgboost_html_feature_names.joblib'
        if xgb_model_path.exists():
            print(f"Loading XGBoost model: {xgb_model_path}")
            self.xgb_model = joblib.load(xgb_model_path)
            self.has_xgb = True
            # Load XGBoost feature names
            if xgb_features_path.exists():
                self.xgb_feature_names = joblib.load(xgb_features_path)
                print(f"Loaded {len(self.xgb_feature_names)} XGBoost feature names")
            else:
                self.xgb_feature_names = None
        else:
            print(f"{Fore.YELLOW}XGBoost model not found{Style.RESET_ALL}")
            self.xgb_model = None
            self.has_xgb = False
            self.xgb_feature_names = None

        # At least one model is required to operate.
        if not self.has_rf and not self.has_xgb:
            raise FileNotFoundError("No trained models found! Train models first.")

        self.extractor = HTMLFeatureExtractor()

    def predict_from_file(self, html_file_path):
        """Predict from HTML file."""
        # Read HTML content; errors='ignore' tolerates unknown/mixed encodings.
        with open(html_file_path, 'r', encoding='utf-8', errors='ignore') as f:
            html_content = f.read()

        return self.predict_from_html(html_content, source=str(html_file_path))

    def predict_from_url(self, url):
        """Download HTML from URL and predict.

        Returns None when the download fails.
        """
        print(f"\nDownloading HTML from: {url}")

        try:
            # Download HTML
            # NOTE(review): verify=False disables TLS certificate checking --
            # deliberate for fetching suspect pages, but insecure in general.
            response = requests.get(url, timeout=10, verify=False)
            html_content = response.text

            return self.predict_from_html(html_content, source=url)

        except Exception as e:
            print(f"{Fore.RED}Error downloading URL: {e}")
            return None

    def predict_from_html(self, html_content, source=""):
        """Predict from HTML content using all available models.

        Returns a dict with per-model 'predictions' plus the raw extracted
        'features'; also prints a colorized report as a side effect.
        """
        # Extract raw features
        features = self.extractor.extract_features(html_content)

        # Apply feature engineering (same as training)
        raw_df = pd.DataFrame([features])
        eng_df = engineer_features(raw_df)

        # Get predictions from all models
        predictions = {}

        if self.has_rf:
            # NOTE(review): truthiness assumes the saved names load as a
            # list; a numpy array here would raise -- confirm the saved type.
            if self.rf_feature_names:
                # Align columns to the training order: engineered columns
                # win, otherwise fall back to the raw feature (or 0).
                feature_values = [eng_df[fn].iloc[0] if fn in eng_df.columns
                                  else features.get(fn, 0)
                                  for fn in self.rf_feature_names]
                X_rf = pd.DataFrame([dict(zip(self.rf_feature_names, feature_values))])
            else:
                X_rf = eng_df

            rf_pred = self.rf_model.predict(X_rf)[0]  # type: ignore
            rf_proba = self.rf_model.predict_proba(X_rf)[0]  # type: ignore
            predictions['Random Forest'] = {
                'prediction': rf_pred,
                'probability': rf_proba
            }

        if self.has_xgb:
            if self.xgb_feature_names:
                feature_values = [eng_df[fn].iloc[0] if fn in eng_df.columns
                                  else features.get(fn, 0)
                                  for fn in self.xgb_feature_names]
                X_xgb = pd.DataFrame([dict(zip(self.xgb_feature_names, feature_values))])
            else:
                X_xgb = eng_df

            xgb_pred = self.xgb_model.predict(X_xgb)[0]  # type: ignore
            xgb_proba = self.xgb_model.predict_proba(X_xgb)[0]  # type: ignore
            predictions['XGBoost'] = {
                'prediction': xgb_pred,
                'probability': xgb_proba
            }

        # Ensemble prediction (average probabilities across both models)
        if len(predictions) > 1:
            avg_proba = sum([p['probability'] for p in predictions.values()]) / len(predictions)
            ensemble_pred = 1 if avg_proba[1] > 0.5 else 0  # type: ignore
            predictions['Ensemble'] = {
                'prediction': ensemble_pred,
                'probability': avg_proba
            }

        # Display results
        self._display_prediction(predictions, features, source)

        return {
            'predictions': predictions,
            'features': features
        }

    def _display_prediction(self, predictions, features, source):
        """Display prediction results with colors. Output only."""
        print("\n" + "="*80)
        if source:
            print(f"Source: {source}")
        print("="*80)

        # Get ensemble or single prediction for final verdict
        if 'Ensemble' in predictions:
            final_pred = predictions['Ensemble']['prediction']
            final_proba = predictions['Ensemble']['probability']
        else:
            # Use the only available model
            model_name = list(predictions.keys())[0]
            final_pred = predictions[model_name]['prediction']
            final_proba = predictions[model_name]['probability']

        # Final Verdict (label convention: 1 = phishing, 0 = legitimate)
        if final_pred == 1:
            print(f"\n{Fore.RED}{'⚠ PHISHING DETECTED ⚠':^80}")
            print(f"{Fore.RED}Confidence: {final_proba[1]*100:.2f}%")
        else:
            print(f"\n{Fore.GREEN}{'✓ LEGITIMATE WEBSITE ✓':^80}")
            print(f"{Fore.GREEN}Confidence: {final_proba[0]*100:.2f}%")

        # Model breakdown
        print("\n" + "-"*80)
        print("Model Predictions:")
        print("-"*80)

        for model_name, result in predictions.items():
            pred = result['prediction']
            proba = result['probability']

            pred_text = 'PHISHING' if pred == 1 else 'LEGITIMATE'
            color = Fore.RED if pred == 1 else Fore.GREEN
            icon = "⚠" if pred == 1 else "✓"

            print(f" {icon} {model_name:15s}: {color}{pred_text:12s}{Style.RESET_ALL} "
                  f"(Legit: {proba[0]*100:5.1f}%, Phish: {proba[1]*100:5.1f}%)")

        # Show key features
        print("\n" + "-"*80)
        print("Key HTML Features:")
        print("-"*80)

        important_features = [
            ('num_forms', 'Number of forms'),
            ('num_password_fields', 'Password fields'),
            ('num_external_links', 'External links'),
            ('num_scripts', 'Scripts'),
            ('num_urgency_keywords', 'Urgency keywords'),
            ('num_brand_mentions', 'Brand mentions'),
            ('has_meta_refresh', 'Meta refresh redirect'),
            ('num_iframes', 'Iframes'),
        ]

        for feat, desc in important_features:
            if feat in features:
                value = features[feat]
                print(f" {desc:25s}: {value}")

        print("="*80)
219
+
220
+
221
def interactive_mode():
    """REPL for analysing HTML files or live URLs with the trained models."""
    print("\n" + "="*80)
    print(f"{Fore.CYAN}{'HTML PHISHING DETECTOR - INTERACTIVE MODE':^80}")
    print("="*80)

    # Abort early when no model can be loaded.
    try:
        predictor = HTMLPhishingPredictor()
    except Exception as e:
        print(f"{Fore.RED}Error loading model: {e}")
        print("\nTrain a model first using:")
        print(" python models/html_enhanced/random_forest_html.py")
        return

    print("\nCommands:")
    print(" file <path> - Analyze HTML file")
    print(" url <url> - Download and analyze URL")
    print(" quit - Exit")
    print("-"*80)

    while True:
        try:
            line = input(f"\n{Fore.CYAN}Enter command: {Style.RESET_ALL}").strip()
        except KeyboardInterrupt:
            print("\n\nGoodbye!")
            break

        if not line:
            continue
        if line.lower() in ['quit', 'exit', 'q']:
            print("\nGoodbye!")
            break

        try:
            # Split into verb + argument; the argument may contain spaces.
            command, _, argument = line.partition(' ')
            command = command.lower()
            argument = argument.strip()

            if command == 'file' and argument:
                if Path(argument).exists():
                    predictor.predict_from_file(argument)
                else:
                    print(f"{Fore.RED}File not found: {argument}")
            elif command == 'url' and argument:
                predictor.predict_from_url(argument)
            else:
                print(f"{Fore.YELLOW}Invalid command. Use: file <path> or url <url>")
        except KeyboardInterrupt:
            print("\n\nGoodbye!")
            break
        except Exception as e:
            print(f"{Fore.RED}Error: {e}")
275
+
276
+
277
def main():
    """Dispatch: argv -> one-shot analysis, no args -> interactive REPL."""
    if len(sys.argv) <= 1:
        # No argument given: drop into the REPL.
        interactive_mode()
        return

    # Command line mode: single file path or URL.
    predictor = HTMLPhishingPredictor()
    arg = sys.argv[1]

    if Path(arg).exists():
        # Existing path on disk wins over URL interpretation.
        predictor.predict_from_file(arg)
    elif arg.startswith('http'):
        predictor.predict_from_url(arg)
    else:
        print(f"Invalid input: {arg}")
        print("\nUsage:")
        print(" python scripts/predict_html.py <html_file>")
        print(" python scripts/predict_html.py <url>")
        print(" python scripts/predict_html.py (interactive mode)")


if __name__ == '__main__':
    main()
scripts/predict_url.py ADDED
@@ -0,0 +1,367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
URL Phishing Detector - Interactive Demo

Test any URL with all trained models and see predictions with confidence scores.
"""

import sys
import pandas as pd
import joblib
from pathlib import Path
from colorama import init, Fore, Style

# Initialize colorama for colored output
init(autoreset=True)
import logging

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%H:%M:%S'
)
logger = logging.getLogger("url_predictor")

# Make the project root importable when this file runs as a script.
sys.path.append(str(Path(__file__).parent.parent))
from scripts.feature_extraction.url.url_features_v2 import URLFeatureExtractorV2
27
+
28
+
29
class URLPhishingDetector:
    """Detect phishing URLs using trained models.

    Loads the Logistic Regression / Random Forest / XGBoost URL models found
    under ``saved_models``, extracts lexical features with
    URLFeatureExtractorV2, and reports per-model predictions with a
    trusted-domain whitelist override.
    """

    def __init__(self):
        """Initialize detector with all models."""
        self.script_dir = Path(__file__).parent.parent
        self.models_dir = (self.script_dir / 'saved_models').resolve()

        # Whitelist of trusted domains. A URL whose host equals one of these
        # (or is a true subdomain of one) is always reported LEGITIMATE.
        self.trusted_domains = {
            # Tech giants
            'youtube.com', 'facebook.com', 'twitter.com', 'x.com',
            'linkedin.com', 'microsoft.com', 'apple.com', 'amazon.com',
            # Development
            'github.com', 'gitlab.com', 'stackoverflow.com', 'npmjs.com',
            # AI Services
            'claude.ai', 'anthropic.com', 'openai.com', 'chatgpt.com',
            # Education & Info
            'wikipedia.org', 'reddit.com', 'quora.com', 'medium.com',
            # Cloud & Services
            'aws.amazon.com', 'azure.microsoft.com', 'cloud.google.com',
            'vercel.com', 'netlify.com', 'heroku.com',
            # Communication
            'slack.com', 'discord.com', 'zoom.us', 'teams.microsoft.com',
            # Finance (major)
            'paypal.com', 'stripe.com', 'visa.com', 'mastercard.com',
            # E-commerce
            'ebay.com', 'shopify.com', 'etsy.com', 'walmart.com',
        }

        # Custom thresholds for each model (reduce false positives)
        self.thresholds = {
            'Logistic Regression': 0.5,  # Standard threshold
            'Random Forest': 0.5,        # Standard threshold
            'XGBoost': 0.5               # Standard threshold
        }

        # Load feature extractor
        self.extractor = URLFeatureExtractorV2()

        # Load scaler (only needed for Logistic Regression)
        scaler_path = self.models_dir / 'scaler.joblib'
        if scaler_path.exists():
            self.scaler = joblib.load(scaler_path)
            logger.info("✓ Loaded scaler")
        else:
            self.scaler = None
            logger.warning("✗ Scaler not found (only needed for Logistic Regression)")

        # Load all models
        self.models = {}
        self.feature_names = {}
        self._load_models()

    def _load_models(self):
        """Load all trained models and remember each one's feature order."""
        model_files = {
            'Logistic Regression': 'logistic_regression.joblib',
            'Random Forest': 'random_forest.joblib',
            'XGBoost': 'xgboost.joblib'
        }

        for name, filename in model_files.items():
            model_path = self.models_dir / filename
            if not model_path.exists():
                # BUGFIX: previously logged a constant placeholder instead of
                # the path of the missing model file.
                logger.warning(f"✗ Model not found: {model_path}")
                continue

            model = joblib.load(model_path)
            self.models[name] = model

            # Store expected feature names from the model itself, or fall
            # back to the scaler's (models fit on bare numpy arrays, such as
            # Logistic Regression, carry no feature_names_in_).
            if hasattr(model, 'feature_names_in_'):
                self.feature_names[name] = list(model.feature_names_in_)
                logger.info(f"✓ Loaded {name} ({len(self.feature_names[name])} features)")
            elif self.scaler is not None and hasattr(self.scaler, 'feature_names_in_'):
                self.feature_names[name] = list(self.scaler.feature_names_in_)
                logger.info(f"✓ Loaded {name} (using scaler features: {len(self.feature_names[name])} features)")
            else:
                logger.info(f"✓ Loaded {name}")

    def _is_whitelisted(self, url: str) -> bool:
        """Return True when the URL's host is a trusted domain or a subdomain of one."""
        from urllib.parse import urlparse
        host = urlparse(url).netloc.lower()
        # NOTE(review): an explicit port (host:443) is kept and defeats the
        # match -- confirm whether ports should be stripped here.
        if host.startswith('www.'):
            # BUGFIX: replace('www.', '') removed the substring anywhere in
            # the host; only the leading label should be dropped.
            host = host[4:]
        # BUGFIX: a bare endswith() let "evil-paypal.com" match "paypal.com",
        # bypassing the models entirely. Require an exact match or a real
        # dot-separated subdomain boundary.
        return any(
            host == trusted or host.endswith('.' + trusted)
            for trusted in self.trusted_domains
        )

    def _align_features(self, features_df, expected_features):
        """Return a numpy row with columns ordered as *expected_features*.

        Features the extractor did not produce are filled with 0. Built from
        a dict so the frame always has exactly one row (assigning a scalar
        column into an empty DataFrame would leave it with zero rows).
        """
        aligned = pd.DataFrame({
            feat: (features_df[feat].values if feat in features_df.columns else [0])
            for feat in expected_features
        })
        # numpy array avoids sklearn's feature-name validation
        return aligned.values

    def predict_url(self, url: str) -> tuple:
        """
        Predict if URL is phishing or legitimate.

        Args:
            url: URL string to analyze

        Returns:
            Tuple of (results, features):
            - results: dict mapping model name -> prediction details
            - features: dict of extracted URL features
        """
        # Whitelisted hosts bypass the models entirely.
        is_whitelisted = self._is_whitelisted(url)

        # Extract features
        features_dict = self.extractor.extract_features(url)

        # Convert to DataFrame (excluding the label column if present)
        features_df = pd.DataFrame([features_dict])
        if 'label' in features_df.columns:
            features_df = features_df.drop('label', axis=1)

        # Get predictions from all models
        results = {}
        for model_name, model in self.models.items():
            # Override for whitelisted domains
            if is_whitelisted:
                results[model_name] = {
                    'prediction': 'LEGITIMATE',
                    'prediction_code': 0,
                    'confidence': 99.99,
                    'phishing_probability': 0.01,
                    'legitimate_probability': 99.99,
                    'whitelisted': True
                }
                continue

            # Align features with the order the model was trained on;
            # fall back to any other model's order, then to raw columns.
            if model_name in self.feature_names:
                features_to_predict = self._align_features(
                    features_df, self.feature_names[model_name])
            elif self.feature_names:
                features_to_predict = self._align_features(
                    features_df, next(iter(self.feature_names.values())))
            else:
                features_to_predict = features_df.values

            # Scale features only for Logistic Regression (trained on
            # scaled inputs).
            if model_name == 'Logistic Regression' and self.scaler is not None:
                features_to_use = self.scaler.transform(features_to_predict)
            else:
                features_to_use = features_to_predict

            # Get probability/confidence (features are already numpy arrays)
            if hasattr(model, 'predict_proba'):
                probabilities = model.predict_proba(features_to_use)[0]
                phishing_prob = probabilities[1] * 100
                legitimate_prob = probabilities[0] * 100

                # Apply custom threshold
                threshold = self.thresholds.get(model_name, 0.5)
                prediction = 1 if probabilities[1] > threshold else 0
                confidence = probabilities[prediction] * 100
            else:
                # For models without predict_proba (fallback)
                prediction = model.predict(features_to_use)[0]
                confidence = 100.0
                phishing_prob = 100.0 if prediction == 1 else 0.0
                legitimate_prob = 0.0 if prediction == 1 else 100.0

            results[model_name] = {
                'prediction': 'PHISHING' if prediction == 1 else 'LEGITIMATE',
                'prediction_code': int(prediction),
                'confidence': confidence,
                'phishing_probability': phishing_prob,
                'legitimate_probability': legitimate_prob,
                'whitelisted': False,
                'threshold': self.thresholds.get(model_name, 0.5)
            }

        return results, features_dict

    def print_results(self, url: str, results: dict, features: dict):
        """Print formatted results. Output only -- returns None."""
        print("\n" + "=" * 80)
        print(f"{Fore.CYAN}{Style.BRIGHT}URL PHISHING DETECTION RESULTS{Style.RESET_ALL}")
        print("=" * 80)

        # Print URL
        print(f"\n{Fore.YELLOW}URL:{Style.RESET_ALL} {url}")

        # Print model predictions
        print(f"\n{Fore.CYAN}{Style.BRIGHT}MODEL PREDICTIONS:{Style.RESET_ALL}")
        print("-" * 80)

        for model_name, result in results.items():
            prediction = result['prediction']
            confidence = result['confidence']
            phishing_prob = result['phishing_probability']
            legitimate_prob = result['legitimate_probability']
            threshold = result.get('threshold', 0.5)

            # Color based on prediction
            if prediction == 'PHISHING':
                color = Fore.RED
                icon = "⚠️"
            else:
                color = Fore.GREEN
                icon = "✓"

            print(f"\n{Style.BRIGHT}{model_name}:{Style.RESET_ALL}")
            print(f" {icon} Prediction: {color}{Style.BRIGHT}{prediction}{Style.RESET_ALL}")

            # Show if whitelisted
            if result.get('whitelisted', False):
                print(f" {Fore.CYAN}ℹ️ Trusted domain (whitelisted){Style.RESET_ALL}")
            else:
                print(f" Decision Threshold: {threshold*100:.0f}%")

            print(f" Confidence: {confidence:.2f}%")
            print(f" Probabilities:")
            print(f" • Phishing: {Fore.RED}{phishing_prob:6.2f}%{Style.RESET_ALL}")
            print(f" • Legitimate: {Fore.GREEN}{legitimate_prob:6.2f}%{Style.RESET_ALL}")

        # Consensus
        print(f"\n{Fore.CYAN}{Style.BRIGHT}CONSENSUS:{Style.RESET_ALL}")
        print("-" * 80)

        phishing_votes = sum(1 for r in results.values() if r['prediction'] == 'PHISHING')
        total_models = len(results)

        if phishing_votes == total_models:
            consensus_color = Fore.RED
            consensus_icon = "🚨"
            consensus_text = "ALL MODELS AGREE: PHISHING"
        elif phishing_votes == 0:
            consensus_color = Fore.GREEN
            consensus_icon = "✅"
            consensus_text = "ALL MODELS AGREE: LEGITIMATE"
        else:
            consensus_color = Fore.YELLOW
            consensus_icon = "⚠️"
            consensus_text = f"MIXED RESULTS: {phishing_votes}/{total_models} models say PHISHING"

        print(f"{consensus_icon} {consensus_color}{Style.BRIGHT}{consensus_text}{Style.RESET_ALL}")

        # Key features (based on top features from models)
        print(f"\n{Fore.CYAN}{Style.BRIGHT}TOP FEATURES (Model Importance):{Style.RESET_ALL}")
        print("-" * 80)

        # Top features from Random Forest and XGBoost analysis; the third
        # tuple element marks binary risk flags for red/green coloring.
        top_features = [
            ('Num Domain Parts', features.get('num_domain_parts', 0), None),
            ('Domain Dots', features.get('domain_dots', 0), None),
            ('URL Shortener', '✓ Yes' if features.get('is_shortened', 0) == 1 else '✗ No',
             features.get('is_shortened', 0)),
            ('Num Subdomains', features.get('num_subdomains', 0), None),
            ('Domain Hyphens', features.get('domain_hyphens', 0), None),
            ('Free Platform', '✓ Yes' if features.get('is_free_platform', 0) == 1 else '✗ No',
             features.get('is_free_platform', 0)),
            ('Free Hosting', '✓ Yes' if features.get('is_free_hosting', 0) == 1 else '✗ No',
             features.get('is_free_hosting', 0)),
            ('Platform Subdomain Len', features.get('platform_subdomain_length', 0), None),
            ('Avg Domain Part Len', f"{features.get('avg_domain_part_len', 0):.2f}", None),
            ('Domain Length Category', features.get('domain_length_category', 0), None),
            ('Path Digits', features.get('path_digits', 0), None),
            ('Is HTTP', '✓ Yes' if features.get('is_http', 0) == 1 else '✗ No',
             features.get('is_http', 0)),
            ('Multiple Brands in URL', '✓ Yes' if features.get('multiple_brands_in_url', 0) == 1 else '✗ No',
             features.get('multiple_brands_in_url', 0)),
            ('Brand in Path', '✓ Yes' if features.get('brand_in_path', 0) == 1 else '✗ No',
             features.get('brand_in_path', 0)),
            ('Path Slashes', features.get('path_slashes', 0), None),
            ('Encoding Diff', f"{features.get('encoding_diff', 0):.3f}", None),
            ('Symbol Ratio (Domain)', f"{features.get('symbol_ratio_domain', 0):.3f}", None),
            ('Domain Length', features.get('domain_length', 0), None),
            ('Has @ Symbol', '✓ Yes' if features.get('has_at_symbol', 0) == 1 else '✗ No',
             features.get('has_at_symbol', 0)),
            ('TLD Length', features.get('tld_length', 0), None),
        ]

        for feature_name, value, risk_flag in top_features:
            # Color code risky features
            if risk_flag is not None:
                if risk_flag == 1:  # Risky feature is present
                    value_display = f"{Fore.RED}{value}{Style.RESET_ALL}"
                else:
                    value_display = f"{Fore.GREEN}{value}{Style.RESET_ALL}"
            else:
                value_display = str(value)

            print(f" • {feature_name:25s}: {value_display}")

        print("\n" + "=" * 80 + "\n")
324
+
325
+
326
def main():
    """Main interactive function.

    Loads all models once, then loops reading URLs from stdin until the
    user types quit/exit/q. Analysis errors are reported and the loop
    continues.
    """
    print(f"\n{Fore.CYAN}{Style.BRIGHT}╔══════════════════════════════════════════════════════════════╗")
    print(f"║ URL PHISHING DETECTOR - INTERACTIVE DEMO ║")
    print(f"╚══════════════════════════════════════════════════════════════╝{Style.RESET_ALL}\n")

    # Initialize detector
    print(f"{Fore.YELLOW}Loading models...{Style.RESET_ALL}")
    detector = URLPhishingDetector()
    print(f"{Fore.GREEN}✓ All models loaded successfully!{Style.RESET_ALL}\n")

    # Interactive loop
    while True:
        print(f"{Fore.CYAN}{'─' * 80}{Style.RESET_ALL}")
        url = input(f"{Fore.YELLOW}Enter URL to test (or 'quit' to exit):{Style.RESET_ALL} ").strip()

        if url.lower() in ['quit', 'exit', 'q']:
            print(f"\n{Fore.GREEN}Thank you for using URL Phishing Detector!{Style.RESET_ALL}\n")
            break

        if not url:
            print(f"{Fore.RED}Please enter a valid URL{Style.RESET_ALL}\n")
            continue

        # Add http:// if no scheme
        # NOTE(review): defaults to http:// here while the combined script
        # defaults to https:// -- confirm which behavior is intended.
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url

        try:
            # Get predictions
            results, features = detector.predict_url(url)

            # Print results
            detector.print_results(url, results, features)

        except Exception as e:
            print(f"\n{Fore.RED}Error analyzing URL: {str(e)}{Style.RESET_ALL}\n")
            logger.error(f"Error: {str(e)}")


if __name__ == "__main__":
    main()
scripts/predict_url_cnn.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CNN Phishing Detector - Interactive Demo
3
+
4
+ Test any URL with both character-level CNN models:
5
+ 1. CNN URL — analyzes the URL string itself
6
+ 2. CNN HTML — fetches the page and analyzes its HTML source
7
+
8
+ Usage:
9
+ python scripts/predict_url_cnn.py
10
+ """
11
+
12
+ import sys
13
+ import json
14
+ import logging
15
+ import warnings
16
+ from pathlib import Path
17
+
18
+ import os
19
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
20
+ os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
21
+
22
+ import numpy as np
23
+ from colorama import init, Fore, Style
24
+
25
+ init(autoreset=True)
26
+ warnings.filterwarnings('ignore')
27
+
28
+ logging.basicConfig(
29
+ level=logging.INFO,
30
+ format='%(asctime)s - %(levelname)s - %(message)s',
31
+ datefmt='%H:%M:%S',
32
+ )
33
+ logger = logging.getLogger('cnn_predictor')
34
+
35
+ # ---------------------------------------------------------------------------
36
+ # Project paths
37
+ # ---------------------------------------------------------------------------
38
+ PROJECT_ROOT = Path(__file__).resolve().parents[1] # src/
39
+ MODELS_DIR = PROJECT_ROOT / 'saved_models'
40
+
41
+ # URL CNN
42
+ URL_MODEL_PATH = MODELS_DIR / 'cnn_url_model.keras'
43
+ URL_VOCAB_PATH = MODELS_DIR / 'cnn_url_vocab.json'
44
+
45
+ # HTML CNN
46
+ HTML_MODEL_PATH = MODELS_DIR / 'cnn_html_model.keras'
47
+ HTML_VOCAB_PATH = MODELS_DIR / 'cnn_html_vocab.json'
48
+
49
+
50
+ class CNNPhishingDetector:
51
+ """Detect phishing URLs using both character-level CNN models."""
52
+
53
+ def __init__(self):
54
+ self.url_model = None
55
+ self.html_model = None
56
+ self.url_vocab = None
57
+ self.html_vocab = None
58
+
59
+ self._load_url_model()
60
+ self._load_html_model()
61
+
62
+ # ── Loading ────────────────────────────────────────────────────
63
+
64
+ def _load_url_model(self):
65
+ """Load URL CNN model and vocabulary."""
66
+ if not URL_VOCAB_PATH.exists() or not URL_MODEL_PATH.exists():
67
+ logger.warning("URL CNN model not found — skipping")
68
+ return
69
+
70
+ with open(URL_VOCAB_PATH, 'r') as f:
71
+ self.url_vocab = json.load(f)
72
+
73
+ import tensorflow as tf
74
+ self.url_model = tf.keras.models.load_model(str(URL_MODEL_PATH))
75
+ logger.info(f"✓ URL CNN loaded (vocab={self.url_vocab['vocab_size']}, "
76
+ f"max_len={self.url_vocab['max_len']})")
77
+
78
+ def _load_html_model(self):
79
+ """Load HTML CNN model and vocabulary."""
80
+ if not HTML_VOCAB_PATH.exists() or not HTML_MODEL_PATH.exists():
81
+ logger.warning("HTML CNN model not found — skipping")
82
+ return
83
+
84
+ with open(HTML_VOCAB_PATH, 'r') as f:
85
+ self.html_vocab = json.load(f)
86
+
87
+ import tensorflow as tf
88
+ self.html_model = tf.keras.models.load_model(str(HTML_MODEL_PATH))
89
+ logger.info(f"✓ HTML CNN loaded (vocab={self.html_vocab['vocab_size']}, "
90
+ f"max_len={self.html_vocab['max_len']})")
91
+
92
+ # ── Encoding ───────────────────────────────────────────────────
93
+
94
+ def _encode_url(self, url: str) -> np.ndarray:
95
+ """Encode a URL string for the URL CNN."""
96
+ char_to_idx = self.url_vocab['char_to_idx']
97
+ max_len = self.url_vocab['max_len']
98
+ encoded = [char_to_idx.get(c, 1) for c in url[:max_len]]
99
+ encoded += [0] * (max_len - len(encoded))
100
+ return np.array([encoded], dtype=np.int32)
101
+
102
+ def _encode_html(self, html: str) -> np.ndarray:
103
+ """Encode an HTML string for the HTML CNN."""
104
+ char_to_idx = self.html_vocab['char_to_idx']
105
+ max_len = self.html_vocab['max_len']
106
+ encoded = [char_to_idx.get(c, 1) for c in html[:max_len]]
107
+ encoded += [0] * (max_len - len(encoded))
108
+ return np.array([encoded], dtype=np.int32)
109
+
110
+ # ── HTML fetching ──────────────────────────────────────────────
111
+
112
+ @staticmethod
113
+ def _fetch_html(url: str, timeout: int = 10) -> str | None:
114
+ """Fetch HTML content from a URL. Returns None on failure."""
115
+ try:
116
+ import requests
117
+ headers = {
118
+ 'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
119
+ 'AppleWebKit/537.36 (KHTML, like Gecko) '
120
+ 'Chrome/120.0.0.0 Safari/537.36'),
121
+ }
122
+ resp = requests.get(url, headers=headers, timeout=timeout,
123
+ verify=False, allow_redirects=True)
124
+ resp.raise_for_status()
125
+ return resp.text
126
+ except Exception as e:
127
+ logger.warning(f" Could not fetch HTML: {e}")
128
+ return None
129
+
130
+ # ── Prediction ─────────────────────────────���───────────────────
131
+
132
+ def predict_url(self, url: str, threshold: float = 0.5) -> dict | None:
133
+ """Predict using the URL CNN model."""
134
+ if self.url_model is None:
135
+ return None
136
+
137
+ X = self._encode_url(url)
138
+ phishing_prob = float(self.url_model.predict(X, verbose=0)[0][0])
139
+ legitimate_prob = 1.0 - phishing_prob
140
+ is_phishing = phishing_prob >= threshold
141
+
142
+ return {
143
+ 'model_name': 'CNN URL (Char-level)',
144
+ 'prediction': 'PHISHING' if is_phishing else 'LEGITIMATE',
145
+ 'prediction_code': int(is_phishing),
146
+ 'confidence': (phishing_prob if is_phishing else legitimate_prob) * 100,
147
+ 'phishing_probability': phishing_prob * 100,
148
+ 'legitimate_probability': legitimate_prob * 100,
149
+ 'threshold': threshold,
150
+ }
151
+
152
+ def predict_html(self, html: str, threshold: float = 0.5) -> dict | None:
153
+ """Predict using the HTML CNN model."""
154
+ if self.html_model is None:
155
+ return None
156
+
157
+ X = self._encode_html(html)
158
+ phishing_prob = float(self.html_model.predict(X, verbose=0)[0][0])
159
+ legitimate_prob = 1.0 - phishing_prob
160
+ is_phishing = phishing_prob >= threshold
161
+
162
+ return {
163
+ 'model_name': 'CNN HTML (Char-level)',
164
+ 'prediction': 'PHISHING' if is_phishing else 'LEGITIMATE',
165
+ 'prediction_code': int(is_phishing),
166
+ 'confidence': (phishing_prob if is_phishing else legitimate_prob) * 100,
167
+ 'phishing_probability': phishing_prob * 100,
168
+ 'legitimate_probability': legitimate_prob * 100,
169
+ 'threshold': threshold,
170
+ 'html_length': len(html),
171
+ }
172
+
173
+ def predict_full(self, url: str, threshold: float = 0.5) -> dict:
174
+ """
175
+ Run both CNN models on a URL.
176
+
177
+ Returns dict with url_result, html_result, and combined verdict.
178
+ """
179
+ # URL CNN
180
+ url_result = self.predict_url(url, threshold)
181
+
182
+ # HTML CNN — fetch page first
183
+ html_result = None
184
+ html_content = None
185
+ if self.html_model is not None:
186
+ html_content = self._fetch_html(url)
187
+ if html_content and len(html_content) >= 100:
188
+ html_result = self.predict_html(html_content, threshold)
189
+
190
+ # Combined verdict
191
+ results = [r for r in [url_result, html_result] if r is not None]
192
+ if len(results) == 2:
193
+ avg_phish = (url_result['phishing_probability'] +
194
+ html_result['phishing_probability']) / 2
195
+ combined_is_phishing = avg_phish >= (threshold * 100)
196
+ combined = {
197
+ 'prediction': 'PHISHING' if combined_is_phishing else 'LEGITIMATE',
198
+ 'phishing_probability': avg_phish,
199
+ 'legitimate_probability': 100 - avg_phish,
200
+ 'confidence': avg_phish if combined_is_phishing else 100 - avg_phish,
201
+ 'agree': url_result['prediction'] == html_result['prediction'],
202
+ }
203
+ elif len(results) == 1:
204
+ r = results[0]
205
+ combined = {
206
+ 'prediction': r['prediction'],
207
+ 'phishing_probability': r['phishing_probability'],
208
+ 'legitimate_probability': r['legitimate_probability'],
209
+ 'confidence': r['confidence'],
210
+ 'agree': True,
211
+ }
212
+ else:
213
+ combined = None
214
+
215
+ return {
216
+ 'url_result': url_result,
217
+ 'html_result': html_result,
218
+ 'html_fetched': html_content is not None,
219
+ 'html_length': len(html_content) if html_content else 0,
220
+ 'combined': combined,
221
+ }
222
+
223
+ # ── Pretty print ───────────────────────────────────────────────
224
+
225
+ def print_results(self, url: str, full: dict):
226
+ """Print formatted prediction results from both models."""
227
+ print("\n" + "=" * 80)
228
+ print(f"{Fore.CYAN}{Style.BRIGHT}CNN PHISHING DETECTION RESULTS{Style.RESET_ALL}")
229
+ print("=" * 80)
230
+ print(f"\n{Fore.YELLOW}URL:{Style.RESET_ALL} {url}")
231
+
232
+ # ── URL CNN ──
233
+ url_r = full['url_result']
234
+ if url_r:
235
+ pred = url_r['prediction']
236
+ color = Fore.RED if pred == 'PHISHING' else Fore.GREEN
237
+ icon = "⚠️" if pred == 'PHISHING' else "✓"
238
+ print(f"\n{Style.BRIGHT}1. CNN URL (Character-level):{Style.RESET_ALL}")
239
+ print(f" {icon} Prediction: {color}{Style.BRIGHT}{pred}{Style.RESET_ALL}")
240
+ print(f" Confidence: {url_r['confidence']:.2f}%")
241
+ print(f" Phishing: {Fore.RED}{url_r['phishing_probability']:6.2f}%{Style.RESET_ALL}")
242
+ print(f" Legitimate: {Fore.GREEN}{url_r['legitimate_probability']:6.2f}%{Style.RESET_ALL}")
243
+ else:
244
+ print(f"\n{Style.BRIGHT}1. CNN URL:{Style.RESET_ALL} {Fore.YELLOW}Not available{Style.RESET_ALL}")
245
+
246
+ # ── HTML CNN ──
247
+ html_r = full['html_result']
248
+ if html_r:
249
+ pred = html_r['prediction']
250
+ color = Fore.RED if pred == 'PHISHING' else Fore.GREEN
251
+ icon = "⚠️" if pred == 'PHISHING' else "✓"
252
+ print(f"\n{Style.BRIGHT}2. CNN HTML (Character-level):{Style.RESET_ALL}")
253
+ print(f" {icon} Prediction: {color}{Style.BRIGHT}{pred}{Style.RESET_ALL}")
254
+ print(f" Confidence: {html_r['confidence']:.2f}%")
255
+ print(f" Phishing: {Fore.RED}{html_r['phishing_probability']:6.2f}%{Style.RESET_ALL}")
256
+ print(f" Legitimate: {Fore.GREEN}{html_r['legitimate_probability']:6.2f}%{Style.RESET_ALL}")
257
+ print(f" HTML length: {html_r['html_length']:,} chars")
258
+ elif full['html_fetched']:
259
+ print(f"\n{Style.BRIGHT}2. CNN HTML:{Style.RESET_ALL} "
260
+ f"{Fore.YELLOW}HTML too short for analysis{Style.RESET_ALL}")
261
+ else:
262
+ print(f"\n{Style.BRIGHT}2. CNN HTML:{Style.RESET_ALL} "
263
+ f"{Fore.YELLOW}Could not fetch page HTML{Style.RESET_ALL}")
264
+
265
+ # ── Combined verdict ──
266
+ combined = full['combined']
267
+ if combined:
268
+ pred = combined['prediction']
269
+ color = Fore.RED if pred == 'PHISHING' else Fore.GREEN
270
+ icon = "⚠️" if pred == 'PHISHING' else "✓"
271
+ agree_str = (f"{Fore.GREEN}YES{Style.RESET_ALL}" if combined['agree']
272
+ else f"{Fore.YELLOW}NO{Style.RESET_ALL}")
273
+
274
+ print(f"\n{'─' * 80}")
275
+ print(f"{Style.BRIGHT}COMBINED VERDICT:{Style.RESET_ALL}")
276
+ print(f" {icon} {color}{Style.BRIGHT}{pred}{Style.RESET_ALL} "
277
+ f"(confidence: {combined['confidence']:.2f}%)")
278
+ print(f" Phishing: {Fore.RED}{combined['phishing_probability']:6.2f}%{Style.RESET_ALL}")
279
+ print(f" Legitimate: {Fore.GREEN}{combined['legitimate_probability']:6.2f}%{Style.RESET_ALL}")
280
+ if url_r and html_r:
281
+ print(f" Models agree: {agree_str}")
282
+
283
+ print("\n" + "=" * 80 + "\n")
284
+
285
+
286
def main():
    """Interactive prediction loop.

    Loads both CNN models, reports which are available, then repeatedly
    prompts for a URL, runs the dual analysis, and prints the verdict
    until the user quits.
    """
    # Banner
    print(f"\n{Fore.CYAN}{Style.BRIGHT}╔══════════════════════════════════════════════════════════════╗")
    print(f"║ CNN PHISHING DETECTOR - INTERACTIVE DEMO ║")
    print(f"║ URL CNN + HTML CNN (Dual Analysis) ║")
    print(f"╚══════════════════════════════════════════════════════════════╝{Style.RESET_ALL}\n")

    print(f"{Fore.YELLOW}Loading CNN models...{Style.RESET_ALL}")
    detector = CNNPhishingDetector()

    # Collect the names of whichever models actually loaded
    available = [label for label, model in (("URL CNN", detector.url_model),
                                            ("HTML CNN", detector.html_model))
                 if model is not None]

    if not available:
        print(f"{Fore.RED}No CNN models found! Train models first.{Style.RESET_ALL}")
        sys.exit(1)

    print(f"{Fore.GREEN}✓ Models loaded: {', '.join(available)}{Style.RESET_ALL}\n")

    while True:
        print(f"{Fore.CYAN}{'─' * 80}{Style.RESET_ALL}")
        target = input(f"{Fore.YELLOW}Enter URL to test (or 'quit' to exit):{Style.RESET_ALL} ").strip()

        if target.lower() in ('quit', 'exit', 'q'):
            print(f"\n{Fore.GREEN}Goodbye!{Style.RESET_ALL}\n")
            return

        if not target:
            print(f"{Fore.RED}Please enter a valid URL.{Style.RESET_ALL}\n")
            continue

        # Default to http:// when the scheme was omitted
        if not target.startswith(('http://', 'https://')):
            target = 'http://' + target

        try:
            detector.print_results(target, detector.predict_full(target))
        except Exception as exc:
            print(f"\n{Fore.RED}Error: {exc}{Style.RESET_ALL}\n")
            logger.error(str(exc))


if __name__ == '__main__':
    main()
scripts/testing/data_leakage_test.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Leakage Detection Script
3
+
4
+ Checks for common data leakage issues:
5
+ 1. Duplicate URLs in train/test split
6
+ 2. Feature extraction timing (done before split - CORRECT)
7
+ 3. Scaler fitting (only on train data - CORRECT)
8
+ 4. Feature contamination checks
9
+ """
10
+
11
+ import pandas as pd
12
+ import numpy as np
13
+ from pathlib import Path
14
+ from sklearn.model_selection import train_test_split
15
+ from sklearn.preprocessing import StandardScaler
16
+ import logging
17
+
18
+ logging.basicConfig(
19
+ level=logging.INFO,
20
+ format='%(asctime)s - %(levelname)s - %(message)s',
21
+ datefmt='%H:%M:%S'
22
+ )
23
+ logger = logging.getLogger("data_leakage_check")
24
+
25
+
26
def check_1_duplicate_urls_in_splits():
    """Check if same URLs appear in both train and test sets.

    Also reports any duplicate URLs already present in the source dataset.
    Returns True when train/test share no URLs, False otherwise.
    """
    logger.info("\n" + "="*80)
    logger.info("CHECK 1: DUPLICATE URLs IN TRAIN/TEST SPLITS")
    logger.info("="*80)

    # Load original dataset with URLs
    df = pd.read_csv(Path('data/processed') / 'clean_dataset_no_duplicates.csv')
    logger.info(f"\nOriginal dataset: {len(df):,} URLs")

    # Duplicates inside the source dataset itself
    dup_count = df['url'].duplicated().sum()
    logger.info(f"Duplicates in original dataset: {dup_count}")

    if dup_count > 0:
        logger.warning(f"⚠️ Found {dup_count} duplicate URLs in original dataset!")
        dup_urls = df[df['url'].duplicated(keep=False)]['url'].value_counts()
        logger.info(f"Top duplicated URLs:\n{dup_urls.head(10)}")
    else:
        logger.info("✓ No duplicates in original dataset")

    # Reproduce the exact split used during training
    labels = df['label']
    X_train, X_test, _, _ = train_test_split(
        df['url'], labels, test_size=0.2, random_state=42, stratify=labels
    )

    logger.info(f"\nTrain set: {len(X_train):,} URLs")
    logger.info(f"Test set: {len(X_test):,} URLs")

    # Any URL present on both sides of the split is leakage
    overlap = set(X_train) & set(X_test)
    logger.info(f"\nOverlapping URLs between train/test: {len(overlap)}")

    if overlap:
        logger.error(f"❌ DATA LEAKAGE DETECTED! {len(overlap)} URLs in both train and test!")
        logger.info(f"Sample overlapping URLs:\n{list(overlap)[:5]}")
        return False

    logger.info("✓ No URL overlap between train and test sets")
    return True
76
def check_2_feature_extraction_timing():
    """Check if features were extracted before split (CORRECT) or after (WRONG)."""
    logger.info("\n" + "="*80)
    logger.info("CHECK 2: FEATURE EXTRACTION TIMING")
    logger.info("="*80)

    # Load feature dataset
    features_df = pd.read_csv('data/features/url_features.csv')

    logger.info(f"\nFeature dataset: {len(features_df):,} rows")
    # Column count minus one for the 'label' column
    logger.info(f"Features: {len(features_df.columns) - 1}")

    # Load original dataset
    # NOTE(review): check_1 reads 'clean_dataset_no_duplicates.csv' while this
    # check compares against 'clean_dataset.csv' — confirm which file feature
    # extraction actually ran on; a mismatch here can make this check fail
    # spuriously even when the pipeline is fine.
    original_df = pd.read_csv('data/processed/clean_dataset.csv')

    logger.info(f"Original dataset: {len(original_df):,} rows")

    # Matching row counts are used as a proxy for "features were extracted
    # once over the whole dataset, before any train/test split".
    if len(features_df) == len(original_df):
        logger.info("✓ Feature extraction done on ENTIRE dataset (before split)")
        logger.info(" This is CORRECT - prevents data leakage")
        return True
    else:
        logger.warning("⚠️ Dataset sizes don't match - check extraction process")
        logger.info(f" Difference: {abs(len(features_df) - len(original_df))}")
        return False
104
def check_3_scaler_fitting():
    """Compare a train-only-fitted scaler against an all-data-fitted one.

    Only the fitted statistics (mean_/scale_) are compared; the transformed
    arrays produced by the original version were never used, and the
    positional slices of the shuffled data were meaningless, so both have
    been removed. Returns True when the two fits barely differ.
    """
    logger.info("\n" + "="*80)
    logger.info("CHECK 3: SCALER FITTING (Logistic Regression only)")
    logger.info("="*80)

    # Load features
    features_df = pd.read_csv('data/features/url_features.csv')

    X = features_df.drop('label', axis=1)
    y = features_df['label']

    # Same split parameters as the training pipeline
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # CORRECT way: fit on train only
    scaler_correct = StandardScaler().fit(X_train)

    # WRONG way: fit on all data (the common leakage mistake)
    scaler_wrong = StandardScaler().fit(X)

    # Compare statistics
    logger.info("\nScaler statistics comparison:")
    logger.info("\nCORRECT (fitted on train only):")
    logger.info(f" Train mean: {scaler_correct.mean_[:5]}")
    logger.info(f" Train std: {scaler_correct.scale_[:5]}")

    logger.info("\nWRONG (fitted on all data):")
    logger.info(f" All mean: {scaler_wrong.mean_[:5]}")
    logger.info(f" All std: {scaler_wrong.scale_[:5]}")

    # Average absolute difference across all features
    mean_diff = np.abs(scaler_correct.mean_ - scaler_wrong.mean_).mean()
    std_diff = np.abs(scaler_correct.scale_ - scaler_wrong.scale_).mean()

    logger.info(f"\nAverage difference:")
    logger.info(f" Mean: {mean_diff:.6f}")
    logger.info(f" Std: {std_diff:.6f}")

    if mean_diff < 0.01 and std_diff < 0.01:
        logger.info("✓ Minimal difference - scaler likely fitted correctly on train only")
        return True
    else:
        logger.warning("⚠️ Significant difference detected - review scaler fitting")
        return False
158
def check_4_feature_contamination():
    """Check for features that could leak information.

    Flags any feature whose absolute correlation with the label exceeds
    0.9. Returns True when nothing suspicious is found.
    """
    logger.info("\n" + "="*80)
    logger.info("CHECK 4: FEATURE CONTAMINATION")
    logger.info("="*80)

    df = pd.read_csv('data/features/url_features.csv')

    logger.info("\nChecking for suspiciously perfect correlations with label...")

    labels = df['label']
    corr = df.drop('label', axis=1).corrwith(labels).abs().sort_values(ascending=False)

    logger.info("\nTop 10 features correlated with label:")
    for feat, value in corr.head(10).items():
        logger.info(f" {feat:30s}: {value:.4f}")

    # Anything above 0.9 is a likely leak
    suspicious = corr[corr > 0.9]

    if suspicious.empty:
        logger.info("✓ No suspiciously high correlations detected")
        return True

    logger.warning(f"⚠️ Found {len(suspicious)} features with >0.9 correlation!")
    logger.warning(f" These might be leaking information:\n{suspicious}")
    return False
190
def check_5_train_test_distribution():
    """Check if train/test have similar distributions.

    Compares the phishing ratio on both sides of the (stratified) split.
    Returns True when the ratios differ by less than 1 percentage point.
    """
    logger.info("\n" + "="*80)
    logger.info("CHECK 5: TRAIN/TEST DISTRIBUTION SIMILARITY")
    logger.info("="*80)

    df = pd.read_csv('data/features/url_features.csv')
    labels = df['label']

    # Only the label halves of the split are needed here
    _, _, y_train, y_test = train_test_split(
        df.drop('label', axis=1), labels,
        test_size=0.2, random_state=42, stratify=labels
    )

    logger.info("\nLabel distribution:")
    logger.info(f" Train: {y_train.value_counts().to_dict()}")
    logger.info(f" Test: {y_test.value_counts().to_dict()}")

    # Fraction of phishing samples on each side
    train_ratio = (y_train == 1).sum() / len(y_train)
    test_ratio = (y_test == 1).sum() / len(y_test)
    gap = abs(train_ratio - test_ratio)

    logger.info(f"\nPhishing ratio:")
    logger.info(f" Train: {train_ratio:.4f}")
    logger.info(f" Test: {test_ratio:.4f}")
    logger.info(f" Difference: {gap:.4f}")

    if gap < 0.01:
        logger.info("✓ Train/test distributions are well balanced")
        return True

    logger.warning("⚠️ Train/test distributions differ significantly")
    return False
226
def main():
    """Run all data leakage checks and log a pass/fail summary.

    Each check yields True (pass), False (fail), or None (errored);
    the summary tallies all three outcomes.
    """
    logger.info("="*80)
    logger.info("DATA LEAKAGE DETECTION")
    logger.info("="*80)

    # (result-dict key, label used in error messages, check function)
    checks = (
        ('duplicates', 'duplicate check', check_1_duplicate_urls_in_splits),
        ('extraction_timing', 'extraction timing check', check_2_feature_extraction_timing),
        ('scaler', 'scaler check', check_3_scaler_fitting),
        ('contamination', 'contamination check', check_4_feature_contamination),
        ('distribution', 'distribution check', check_5_train_test_distribution),
    )

    results = {}
    for key, label, run_check in checks:
        try:
            results[key] = run_check()
        except Exception as e:
            logger.error(f"Error in {label}: {e}")
            results[key] = None

    # Final summary
    logger.info("\n" + "="*80)
    logger.info("SUMMARY")
    logger.info("="*80)

    outcomes = list(results.values())
    passed = outcomes.count(True)
    failed = outcomes.count(False)
    errors = outcomes.count(None)

    logger.info(f"\nChecks passed: {passed}")
    logger.info(f"Checks failed: {failed}")
    logger.info(f"Checks errored: {errors}")

    for check, result in results.items():
        if result is True:
            status = "✓ PASS"
        elif result is False:
            status = "❌ FAIL"
        else:
            status = "⚠️ ERROR"
        logger.info(f" {check:20s}: {status}")

    if failed == 0 and errors == 0:
        logger.info("\n🎉 ALL CHECKS PASSED - No data leakage detected!")
        logger.info("Your results are LEGITIMATE!")
    elif failed > 0:
        logger.warning(f"\n⚠️ {failed} checks failed - review your pipeline!")

    logger.info("\n" + "="*80)


if __name__ == "__main__":
    main()
scripts/testing/test_feature_alignment.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test feature alignment between extractor and models
3
+ """
4
+ import sys
5
+ from pathlib import Path
6
+ import joblib
7
+ import pandas as pd
8
+
9
+ sys.path.append(str(Path(__file__).parent))
10
+ from scripts.feature_extraction.url_features_v2 import URLFeatureExtractorV2
11
+
12
def test_feature_alignment():
    """Test that feature extraction produces features in the correct order for models.

    For each saved model: determine its expected feature names (from the
    model itself, from the scaler, or from a previously inspected model as a
    fallback), compare them against freshly extracted features, then run a
    prediction on the aligned frame.
    """

    # NOTE(review): this file sits in scripts/testing/, so Path(__file__).parent
    # points at scripts/testing/ — confirm 'saved_models' resolves from here
    # (it may need parents[2] if the models live at the project root).
    models_dir = Path(__file__).parent / 'saved_models'

    model_files = {
        'Logistic Regression': 'logistic_regression.joblib',
        'Random Forest': 'random_forest.joblib',
        'XGBoost': 'xgboost.joblib'
    }

    # Load scaler (used to scale LR input and as a feature-name source)
    scaler_path = models_dir / 'scaler.joblib'
    scaler = None
    if scaler_path.exists():
        scaler = joblib.load(scaler_path)
        print(f"✓ Loaded scaler")
        if hasattr(scaler, 'feature_names_in_'):
            print(f" Scaler has {len(scaler.feature_names_in_)} feature names\n")

    # Initialize extractor
    extractor = URLFeatureExtractorV2()

    # Test URL
    test_url = "https://github.com/user/repo"

    print("Testing feature alignment...\n")
    print(f"Test URL: {test_url}\n")

    # Extract features into a single-row frame (drop label if present)
    features_dict = extractor.extract_features(test_url)
    features_df = pd.DataFrame([features_dict])
    if 'label' in features_df.columns:
        features_df = features_df.drop('label', axis=1)

    print(f"Extracted {len(features_df.columns)} features\n")

    # Remember feature names per model so later models lacking metadata can
    # fall back to an earlier model's ordering.
    feature_names_store = {}

    # Check each model
    for name, filename in model_files.items():
        model_path = models_dir / filename
        if not model_path.exists():
            print(f"❌ {name}: Model file not found")
            continue

        model = joblib.load(model_path)

        # Determine expected features: model metadata > scaler > fallback.
        # hasattr(None, ...) is False, so a missing scaler is handled safely.
        expected_features = None
        source = None

        if hasattr(model, 'feature_names_in_'):
            expected_features = list(model.feature_names_in_)
            source = "model"
        elif hasattr(scaler, 'feature_names_in_'):
            expected_features = list(scaler.feature_names_in_)
            source = "scaler"
        elif feature_names_store:
            expected_features = list(feature_names_store.values())[0]
            source = "fallback"

        if expected_features:
            feature_names_store[name] = expected_features
            print(f"✓ {name}:")
            # BUGFIX: this line was accidentally printed twice in the original.
            print(f" Expected features: {len(expected_features)} (from {source})")

            # Check missing/extra features
            missing = set(expected_features) - set(features_df.columns)
            extra = set(features_df.columns) - set(expected_features)

            if missing:
                print(f" ⚠ Missing features: {len(missing)}")
                print(f" {list(missing)[:5]}...")

            if extra:
                print(f" ⚠ Extra features: {len(extra)}")
                print(f" {list(extra)[:5]}...")

            if not missing and not extra:
                print(f" ✓ Perfect match!")

            # Rebuild the frame in the model's expected column order,
            # filling any missing column with 0.
            features_aligned = pd.DataFrame(columns=expected_features)
            for feat in expected_features:
                if feat in features_df.columns:
                    features_aligned[feat] = features_df[feat].values
                else:
                    features_aligned[feat] = 0

            # Scale only for Logistic Regression (tree models are unscaled)
            if name == 'Logistic Regression' and scaler is not None:
                features_to_use = scaler.transform(features_aligned)
            else:
                features_to_use = features_aligned

            try:
                pred = model.predict(features_to_use)[0]
                proba = model.predict_proba(features_to_use)[0]
                print(f" ✓ Prediction successful: {'PHISHING' if pred == 1 else 'LEGITIMATE'} ({proba[pred]*100:.1f}%)")
            except Exception as e:
                print(f" ❌ Prediction failed: {e}")
        else:
            print(f"⚠ {name}: No feature names available")

        print()


if __name__ == "__main__":
    test_feature_alignment()
scripts/testing/test_normalization.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test URL normalization - verify www/http variants produce same features
3
+ """
4
+ import sys
5
+ from pathlib import Path
6
+ import pandas as pd
7
+
8
+ sys.path.append(str(Path(__file__).parent))
9
+ from scripts.feature_extraction.url_features_v3 import URLFeatureExtractorOptimized
10
+
11
def test_normalization():
    """Test that www/http URL variants produce identical features.

    Feeds groups of equivalent URLs (scheme and 'www.' variants) through
    the extractor and verifies that all key features match across a group,
    while the is_http flag is allowed to vary with the input scheme.
    Unused locals from the original (first_features, norm_url, orig) have
    been removed.
    """

    extractor = URLFeatureExtractorOptimized()

    print("=" * 80)
    print("URL NORMALIZATION TEST")
    print("=" * 80)
    print()

    # Each inner list is one group of URLs expected to normalize to the
    # same domain and yield identical features (except is_http).
    test_cases = [
        [
            "https://github.com/user/repo",
            "http://github.com/user/repo",
            "https://www.github.com/user/repo",
            "http://www.github.com/user/repo",
            "www.github.com/user/repo",
            "github.com/user/repo"
        ],
        [
            "https://example.com/login?user=test",
            "www.example.com/login?user=test",
            "http://www.example.com/login?user=test"
        ]
    ]

    for i, urls in enumerate(test_cases, 1):
        print(f"Test Case {i}: {urls[0].split('/')[2]}")
        print("-" * 80)

        features_list = []
        for url in urls:
            features = extractor.extract_features(url)
            features_list.append(features)

            # Show how the extractor normalized this variant
            _, _, norm_domain, is_http = extractor.normalize_url(url)
            print(f" {url:45s} → {norm_domain:20s} http={is_http}")

        # Features that must be identical across all variants in the group
        key_features = [
            'domain_length', 'domain_dots', 'num_subdomains', 'domain_entropy',
            'path_length', 'url_entropy', 'is_shortened', 'is_free_platform',
            'has_suspicious_tld', 'num_phishing_keywords'
        ]

        print("\n Key Features Comparison:")
        print(" " + "-" * 76)

        all_identical = True

        for feat in key_features:
            values = [f[feat] for f in features_list]

            if len(set(values)) == 1:
                status = "✓"
            else:
                status = "✗"
                all_identical = False

            print(f" {status} {feat:30s}: {values[0]}")

        # is_http is the one feature expected to differ between variants
        print("\n HTTP Flag (should vary based on input):")
        print(" " + "-" * 76)
        for j, url in enumerate(urls):
            http_flag = features_list[j]['is_http']
            print(f" {url:45s} → http={http_flag}")

        print()
        if all_identical:
            print(f" ✅ TEST PASSED: All key features identical!")
        else:
            print(f" ❌ TEST FAILED: Features differ!")

        print("\n")

    print("=" * 80)
    print("FEATURE COUNT")
    print("=" * 80)

    # NOTE(review): get_feature_names() is not visible from here — confirm the
    # optimized extractor really exposes it (the original call carried a
    # pyright ignore, suggesting the type checker could not find it either).
    feature_names = extractor.get_feature_names()
    print(f"Total features: {len(feature_names)}")
    print()
    print("Top 30 features:")
    for idx, name in enumerate(feature_names[:30], 1):
        print(f" {idx:2d}. {name}")


if __name__ == "__main__":
    test_normalization()
scripts/testing/test_server.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test Server Predictions Against Dataset
3
+ Validates server API predictions vs actual labels
4
+ """
5
+ import pandas as pd
6
+ import requests
7
+ from pathlib import Path
8
+ import logging
9
+ from tqdm import tqdm
10
+ import time
11
+ from sklearn.metrics import (
12
+ accuracy_score, precision_score, recall_score, f1_score,
13
+ confusion_matrix, classification_report
14
+ )
15
+
16
# Setup logging — timestamped INFO-level console output shared by the module.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%H:%M:%S'
)
logger = logging.getLogger(__name__)
23
+
24
+
25
class ServerTester:
    """Test phishing detection server against dataset.

    Replays a labelled URL CSV against the running HTTP prediction API,
    collects per-URL verdicts, and reports accuracy/precision/recall/F1.
    """

    def __init__(self, server_url='http://localhost:8000', batch_size=100):
        # Base URL of the prediction server (no trailing slash expected).
        self.server_url = server_url
        # NOTE(review): batch_size is stored but never used below — requests
        # are issued one at a time; confirm whether batching was intended.
        self.batch_size = batch_size
        # Accumulated per-URL outcome dicts (populated by test_dataset).
        self.results = []

    def check_server_health(self):
        """Check if server is running; return True on a 200 from /api/health."""
        try:
            response = requests.get(f"{self.server_url}/api/health", timeout=5)
            if response.status_code == 200:
                health = response.json()
                logger.info(f"✓ Server is healthy")
                logger.info(f" URL models: {health.get('url_models', 0)}")
                logger.info(f" HTML models: {health.get('html_models', 0)}")
                return True
            else:
                logger.error(f"Server health check failed: {response.status_code}")
                return False
        except Exception as e:
            # Connection refused / timeout etc. — treat as "not running".
            logger.error(f"Cannot connect to server: {e}")
            logger.error(f"Make sure server is running: python server/app.py")
            return False

    def predict_url(self, url):
        """Get prediction from server for a URL.

        Returns:
            dict with 'predicted' (1 = phishing, 0 = legitimate),
            'consensus' and the per-model 'predictions' list,
            or None on any request/server error.
        """
        try:
            response = requests.post(
                f"{self.server_url}/api/predict/url",
                json={"url": url},
                timeout=10
            )

            if response.status_code == 200:
                result = response.json()
                return {
                    'predicted': 1 if result['is_phishing'] else 0,
                    'consensus': result['consensus'],
                    'predictions': result['predictions']
                }
            else:
                logger.warning(f"Server error for {url}: {response.status_code}")
                return None

        except Exception as e:
            # Network failures are logged and surfaced as None so the caller
            # can count them instead of aborting the whole run.
            logger.warning(f"Request error for {url}: {e}")
            return None

    def test_dataset(self, dataset_path, limit=None, sample_frac=None):
        """
        Test server predictions against dataset.

        Args:
            dataset_path: Path to CSV with 'url' and 'label' columns
            limit: Maximum number of URLs to test (None = all)
            sample_frac: Random sample fraction (e.g., 0.1 = 10%)

        Returns:
            dict with 'y_true', 'y_pred' and per-URL 'results',
            or None when the server health check fails.
        """
        logger.info("="*80)
        logger.info("SERVER PREDICTION TESTING")
        logger.info("="*80)

        # Load dataset
        logger.info(f"\n1. Loading dataset: {dataset_path}")
        df = pd.read_csv(dataset_path)

        # Ensure we have required columns
        if 'label' not in df.columns:
            # Assume first column is URL, second is label
            # NOTE(review): this assumes the CSV has exactly two columns;
            # pandas raises if it has more — confirm upstream file format.
            df.columns = ['url', 'label']

        logger.info(f" Total URLs: {len(df):,}")
        logger.info(f" Phishing: {(df['label']==1).sum():,}")
        logger.info(f" Legitimate: {(df['label']==0).sum():,}")

        # Sample if requested (fixed seed so repeated runs test the same subset)
        if sample_frac:
            df = df.sample(frac=sample_frac, random_state=42)
            logger.info(f"\n Sampled {sample_frac*100:.1f}%: {len(df):,} URLs")

        # Limit if requested
        if limit and limit < len(df):
            df = df.head(limit)
            logger.info(f" Limited to: {limit:,} URLs")

        # Check server
        logger.info("\n2. Checking server health...")
        if not self.check_server_health():
            return None

        # Test predictions
        logger.info("\n3. Testing predictions...")
        y_true = []
        y_pred = []
        errors = 0

        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Testing URLs"):
            url = row['url'] if 'url' in row else row.iloc[0]
            true_label = int(row['label']) if 'label' in row else int(row.iloc[1])

            # Get prediction
            result = self.predict_url(url)

            if result:
                y_true.append(true_label)
                y_pred.append(result['predicted'])

                self.results.append({
                    'url': url,
                    'true_label': true_label,
                    'predicted_label': result['predicted'],
                    'consensus': result['consensus'],
                    'correct': true_label == result['predicted']
                })
            else:
                errors += 1

            # Rate limiting
            time.sleep(0.01)  # 10ms delay between requests

        logger.info(f"\n Processed: {len(y_pred):,} URLs")
        if errors > 0:
            logger.warning(f" Errors: {errors:,}")

        # Calculate metrics
        self._display_results(y_true, y_pred)

        return {
            'y_true': y_true,
            'y_pred': y_pred,
            'results': self.results
        }

    def _display_results(self, y_true, y_pred):
        """Display test results and metrics via the module logger."""
        logger.info("\n" + "="*80)
        logger.info("TEST RESULTS")
        logger.info("="*80)

        # Calculate metrics
        # NOTE(review): precision/recall are undefined when a class is absent
        # from the sample — sklearn will warn and return 0 in that case.
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)

        logger.info(f"\nOverall Metrics:")
        logger.info(f" Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
        logger.info(f" Precision: {precision:.4f} ({precision*100:.2f}%)")
        logger.info(f" Recall: {recall:.4f} ({recall*100:.2f}%)")
        logger.info(f" F1-Score: {f1:.4f} ({f1*100:.2f}%)")

        # Confusion matrix
        # NOTE(review): ravel() into four values assumes both classes occur;
        # a single-class sample raises here — confirm inputs.
        cm = confusion_matrix(y_true, y_pred)
        tn, fp, fn, tp = cm.ravel()

        logger.info(f"\nConfusion Matrix:")
        logger.info(f" Predicted")
        logger.info(f" Legit Phish")
        logger.info(f"Actual Legit {tn:6,} {fp:6,}")
        logger.info(f" Phish {fn:6,} {tp:6,}")

        logger.info(f"\nError Analysis:")
        logger.info(f" True Negatives: {tn:,} (correctly identified legitimate)")
        logger.info(f" True Positives: {tp:,} (correctly identified phishing)")
        # NOTE(review): the two rate lines divide by (tn+fp) / (tp+fn) and
        # would raise ZeroDivisionError if a class is missing.
        logger.info(f" False Positives: {fp:,} ({fp/(tn+fp)*100:.2f}% of legitimate marked as phishing)")
        logger.info(f" False Negatives: {fn:,} ({fn/(tp+fn)*100:.2f}% of phishing marked as legitimate) ⚠️")

        # Classification report
        logger.info(f"\nDetailed Classification Report:")
        logger.info(classification_report(
            y_true, y_pred,
            target_names=['Legitimate', 'Phishing'],
            digits=4
        ))

    def save_results(self, output_path):
        """Save test results to CSV; no-op (with a warning) when empty."""
        if not self.results:
            logger.warning("No results to save")
            return

        df = pd.DataFrame(self.results)
        df.to_csv(output_path, index=False)
        logger.info(f"\n✓ Results saved: {output_path}")
        logger.info(f" Total: {len(df):,} predictions")
        logger.info(f" Correct: {df['correct'].sum():,} ({df['correct'].mean()*100:.2f}%)")
        logger.info(f" Incorrect: {(~df['correct']).sum():,}")
213
+
214
+
215
def main():
    """Entry point: run a capped validation pass against the local API server."""
    dataset_csv = Path('data/processed/mega_dataset_full_912357.csv')
    results_csv = Path('results/server_test_results.csv')
    results_csv.parent.mkdir(parents=True, exist_ok=True)

    # Bail out early (listing alternatives) when the dataset is missing.
    if not dataset_csv.exists():
        logger.error(f"Dataset not found: {dataset_csv}")
        logger.info("Available datasets:")
        for csv_file in Path('data/processed').glob('*.csv'):
            logger.info(f" - {csv_file}")
        return

    tester = ServerTester(server_url='http://localhost:8000')

    logger.info("\nTesting with 10% sample for quick validation...")
    logger.info("(Use sample_frac=1.0 or remove it to test full dataset)")

    # `limit` caps the run at an exact URL count; pass sample_frac=0.1 for a
    # random 10% (~91k URLs) or 1.0 for the full dataset instead.
    outcome = tester.test_dataset(dataset_csv, limit=1000)

    if not outcome:
        return

    tester.save_results(results_csv)

    logger.info("\n" + "="*80)
    logger.info("✓ SERVER TESTING COMPLETE!")
    logger.info("="*80)
    logger.info(f"\nResults saved to: {results_csv}")
    logger.info("\nTo test full dataset, change sample_frac=1.0")


if __name__ == '__main__':
    main()
scripts/utils/analyze_dataset.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import sys

# Cleaned URL dataset to summarise.
csv_path = 'data/processed/url_dataset_cleaned.csv'

try:
    data = pd.read_csv(csv_path)
    total = len(data)

    banner = "=" * 50
    rule = "-" * 50

    print(banner)
    print("ANALÝZA DATASETU")
    print(banner)
    print(f"\nCelkový počet záznamov: {total}")
    print(f"\nRozdělenie labelov:")
    print(rule)

    counts = data['label'].value_counts().sort_index()

    # Per-label count and share of the whole dataset.
    for lbl, cnt in counts.items():
        pct = (cnt / total) * 100
        print(f"Label {lbl}: {cnt} záznamov ({pct:.2f}%)")

    print(rule)
    print(f"\nPomer label 0 / label 1: {counts.get(0, 0) / counts.get(1, 1):.2f}")

    # Flag missing labels when present.
    n_missing = data['label'].isna().sum()
    if n_missing > 0:
        print(f"\nChýbajúce labely: {n_missing}")

    print("\n" + banner)

except FileNotFoundError:
    print(f"Súbor '{csv_path}' nebol nájdený")
    print(f"Aktuálny adresár: {sys.path[0]}")
except KeyError:
    print("Stĺpec 'label' neexistuje v datasete")
    print(f"Dostupné stĺpce: {list(data.columns)}")  # type: ignore
except Exception as e:
    print(f"Chyba: {e}")
scripts/utils/balance_dataset.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd

# Undersample the majority class so both labels end up with equal counts.
src = "data/processed/url_dataset_cleaned.csv"
dst = "data/processed/url_dataset_balanced.csv"

print("Loading dataset...")
data = pd.read_csv(src)
print(f"Total rows: {len(data):,}")

counts = data["label"].value_counts()
print(f"Label 0: {counts[0]:,} | Label 1: {counts[1]:,}")

target = counts.min()
rare = counts.idxmin()
common = counts.idxmax()

print(f"\nBalancing to {target:,} per label (matching label {rare})...")

# Keep every minority row; randomly draw the same number of majority rows.
kept_rare = data[data["label"] == rare]
kept_common = data[data["label"] == common].sample(n=target, random_state=42)

# Shuffle the combined frame with a fixed seed for reproducibility.
balanced = (
    pd.concat([kept_rare, kept_common])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

final_counts = balanced["label"].value_counts().sort_index()
print(f"\nBalanced dataset:")
for lbl, cnt in final_counts.items():
    print(f" Label {lbl}: {cnt:,}")

balanced.to_csv(dst, index=False)
print(f"\nSaved {len(balanced):,} rows to {dst}")
scripts/utils/clean_urls.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import sys
3
+ import os
4
+
5
def clean_url(url):
    """Remove www. and ensure http(s):// prefix.

    Strips a leading ``www.`` (with or without an explicit scheme) and
    prepends ``http://`` when no scheme is present.
    """
    url = str(url).strip()

    # Drop 'www.' that directly follows an explicit scheme; the scheme is
    # preserved, so nothing more needs to be done in that case.
    for scheme in ("https://", "http://"):
        prefixed = scheme + "www."
        if url.startswith(prefixed):
            return scheme + url[len(prefixed):]

    # Bare 'www.' without a scheme.
    if url.startswith("www."):
        url = url[len("www."):]

    # Guarantee a scheme on everything else.
    if not (url.startswith("http://") or url.startswith("https://")):
        url = "http://" + url

    return url
22
+
23
+
24
def main():
    """CLI driver: clean every URL in a CSV and write a deduplicated copy.

    argv[1] overrides the input path, argv[2] the output path; the default
    output name appends ``_cleaned`` before the extension.
    """
    argv = sys.argv
    input_path = argv[1] if len(argv) > 1 else "data/raw/top-1m.csv"

    base, ext = os.path.splitext(input_path)
    output_path = argv[2] if len(argv) > 2 else f"{base}_cleaned{ext}"

    print(f"Reading {input_path}...")
    df = pd.read_csv(input_path)
    print(f"Loaded {len(df):,} rows")

    print("Cleaning URLs...")
    df["url"] = df["url"].apply(clean_url)

    # Cleaning can collapse distinct rows (e.g. www./non-www pairs); keep the
    # first occurrence of each cleaned URL.
    before = len(df)
    df.drop_duplicates(subset=["url"], keep="first", inplace=True)
    dropped = before - len(df)
    if dropped:
        print(f"Removed {dropped:,} duplicates after cleaning")

    df.to_csv(output_path, index=False)
    print(f"Saved {len(df):,} rows to {output_path}")


if __name__ == "__main__":
    main()
scripts/utils/merge_datasets.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd

# Merge the phishing and legitimate URL lists into one labelled dataset.
phishing = pd.read_csv('data/raw/phishing.csv')
legitimate = pd.read_csv('data/raw/legitimate.csv')

# Normalise header casing of the second file before concatenation.
legitimate.columns = legitimate.columns.str.lower()

merged = pd.concat([phishing, legitimate], ignore_index=True).drop_duplicates()

merged.to_csv('data/processed/clean_dataset.csv', index=False)

print(f"Datasety boli úspešne spojené")
print(f"Počet záznamov v prvom súbore: {len(phishing)}")
print(f"Počet záznamov v druhom súbore: {len(legitimate)}")
print(f"Celkový počet záznamov: {len(merged)}")
print(f"\nPrvých 5 riadkov spojeného datasetu:")
print(merged.head())
scripts/utils/remove_duplicates.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Remove duplicates from clean_dataset.csv
3
+ """
4
+ import pandas as pd
5
+
6
+ # Load dataset
7
+ df = pd.read_csv('data/processed/clean_dataset.csv')
8
+ print(f"Original: {len(df):,} URLs")
9
+
10
+ # Check duplicates
11
+ print(f"Duplicates: {df.duplicated(subset='url').sum():,}")
12
+
13
+ # Keep first occurrence of each URL
14
+ df_clean = df.drop_duplicates(subset='url', keep='first')
15
+ print(f"After removing duplicates: {len(df_clean):,} URLs")
16
+
17
+ # Check label distribution
18
+ print(f"\nLabel distribution:")
19
+ print(df_clean['label'].value_counts())
20
+
21
+ # Save
22
+ df_clean.to_csv('data/processed/clean_dataset_no_duplicates.csv', index=False)
23
+ print(f"\n✓ Saved to: data/processed/clean_dataset_no_duplicates.csv")
server/__pycache__/app.cpython-313.pyc ADDED
Binary file (40 kB). View file
 
server/app.py ADDED
@@ -0,0 +1,819 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Phishing Detection API Server
3
+ FastAPI server combining URL and HTML phishing detection
4
+ """
5
+ import os
6
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
7
+ os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
8
+
9
+ import sys
10
+ from pathlib import Path
11
+ from typing import Optional
12
+ import warnings
13
+
14
+ # Suppress warnings before importing other libraries
15
+ warnings.filterwarnings('ignore', category=UserWarning)
16
+ warnings.filterwarnings('ignore', message='.*XGBoost.*')
17
+ warnings.filterwarnings('ignore', message='.*Unverified HTTPS.*')
18
+
19
+ from fastapi import FastAPI, HTTPException
20
+ from fastapi.staticfiles import StaticFiles
21
+ from fastapi.responses import HTMLResponse, JSONResponse
22
+ from fastapi.middleware.cors import CORSMiddleware
23
+ from pydantic import BaseModel
24
+ import json
25
+ import joblib
26
+ import pandas as pd
27
+ import numpy as np
28
+ import requests
29
+ from urllib.parse import urlparse
30
+ import logging
31
+ import urllib3
32
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
33
+
34
+ # Add parent directory to path
35
+ sys.path.append(str(Path(__file__).parent.parent))
36
+
37
+ # Use OPTIMIZED URL feature extractor with normalization
38
+ from scripts.feature_extraction.url.url_features_v3 import URLFeatureExtractorOptimized
39
+ from scripts.feature_extraction.html.html_feature_extractor import HTMLFeatureExtractor
40
+ from scripts.feature_extraction.html.feature_engineering import engineer_features
41
+
42
# Setup logging — default INFO-level root configuration for the server module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
45
+
46
+
47
+ # Helper function to convert numpy/pandas types to Python native types
48
def convert_to_json_serializable(obj):
    """Recursively coerce numpy/pandas values into plain Python types.

    Dicts and lists are walked recursively; numpy scalars become int/float/
    bool, ndarrays become lists, and pandas Series/DataFrames are converted
    via their dict representation. Anything else passes through untouched.
    """
    if isinstance(obj, dict):
        return {k: convert_to_json_serializable(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_to_json_serializable(item) for item in obj]
    if isinstance(obj, (np.integer, np.int64, np.int32)):  # type: ignore
        return int(obj)
    if isinstance(obj, (np.floating, np.float64, np.float32)):  # type: ignore
        return float(obj)
    if isinstance(obj, np.ndarray):
        return convert_to_json_serializable(obj.tolist())
    if isinstance(obj, (pd.Series, pd.DataFrame)):
        return convert_to_json_serializable(obj.to_dict())
    if isinstance(obj, np.bool_):
        return bool(obj)
    return obj
66
+
67
+
68
# Initialize FastAPI app
app = FastAPI(
    title="Phishing Detection API",
    description="API for detecting phishing URLs and HTML content",
    version="1.0.0"
)

# CORS middleware
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive — confirm this is intended outside local development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount static files — serves the bundled web UI from server/static at /static.
static_dir = Path(__file__).parent / 'static'
app.mount("/static", StaticFiles(directory=str(static_dir)), name="static")
87
+
88
+
89
+ # Request models
90
class URLRequest(BaseModel):
    """Request body for URL-based prediction endpoints."""
    url: str  # the URL to classify
92
+
93
class HTMLRequest(BaseModel):
    """Request body for raw-HTML prediction endpoints."""
    html_content: str  # full HTML document to classify
    url: Optional[str] = None  # originating URL, when known
96
+
97
+
98
+ # Response models
99
class PredictionResult(BaseModel):
    """One model's verdict; probability/confidence fields are percentages (0-100)."""
    model_name: str
    prediction: str  # 'PHISHING' or 'LEGITIMATE'
    confidence: float
    phishing_probability: float
    legitimate_probability: float
105
+
106
+
107
class URLPredictionResponse(BaseModel):
    """Aggregate response for a URL prediction: overall verdict plus per-model detail."""
    url: str
    is_phishing: bool  # majority vote across models
    consensus: str  # human-readable vote summary
    predictions: list[PredictionResult]
    features: dict  # raw extracted URL features
113
+
114
+
115
class HTMLPredictionResponse(BaseModel):
    """Aggregate response for an HTML prediction: overall verdict plus per-model detail."""
    source: str  # URL or 'HTML Content' label for where the HTML came from
    is_phishing: bool
    consensus: str
    predictions: list[PredictionResult]
    features: dict  # raw extracted HTML features
121
+
122
+
123
class PhishingDetectorService:
    """Singleton service for phishing detection with pre-loaded models."""

    # Singleton bookkeeping: one shared instance, models loaded only once.
    _instance = None
    _initialized = False

    # Domains whose URLs short-circuit to a canned LEGITIMATE verdict.
    # Matching semantics live in predict_url, not here.
    TRUSTED_DOMAINS = frozenset({
        'youtube.com', 'facebook.com', 'twitter.com', 'x.com',
        'linkedin.com', 'microsoft.com', 'apple.com', 'amazon.com',
        'github.com', 'gitlab.com', 'stackoverflow.com',
        'claude.ai', 'anthropic.com', 'openai.com', 'chatgpt.com',
        'wikipedia.org', 'reddit.com', 'instagram.com', 'whatsapp.com',
    })

    # Decision threshold applied to each model's phishing probability.
    DEFAULT_THRESHOLD = 0.5

    # Browser-like User-Agent used when downloading a page's HTML.
    HTML_DOWNLOAD_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
142
+
143
    def __new__(cls):
        # Classic singleton: create the instance exactly once, then reuse it.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance
147
+
148
    def __init__(self):
        """Load all extractors and models once; subsequent calls are no-ops."""
        # __init__ runs on every PhishingDetectorService() call because of the
        # singleton __new__; the flag prevents re-loading the models.
        if self._initialized:
            return

        logger.info("Initializing Phishing Detector Service...")

        self.models_dir = Path(__file__).parent.parent / 'saved_models'

        # Initialize extractors
        self.url_extractor = URLFeatureExtractorOptimized()
        self.html_extractor = HTMLFeatureExtractor()

        # Load models (each loader skips files that are absent on disk)
        self.url_models = {}
        self.url_feature_names = {}
        self.scaler = None
        self._load_url_models()

        self.html_models = {}
        self._load_html_models()

        self.combined_models = {}
        self._load_combined_models()

        # CNN models — left as None when the model/vocab files are missing
        self.cnn_url_model = None
        self.cnn_url_vocab = None
        self.cnn_html_model = None
        self.cnn_html_vocab = None
        self._load_cnn_url_model()
        self._load_cnn_html_model()

        self._initialized = True
        logger.info("✓ Service initialized successfully")
182
+
183
    def _load_url_models(self):
        """Load URL prediction models and remember each model's feature order."""
        # Load scaler (needed at predict time for Logistic Regression inputs)
        scaler_path = self.models_dir / 'scaler.joblib'
        if scaler_path.exists():
            self.scaler = joblib.load(scaler_path)
            logger.info("✓ Loaded scaler for URL models")

        # Load models — absent files are silently skipped
        url_model_files = {
            'Logistic Regression': 'logistic_regression.joblib',
            'Random Forest': 'random_forest.joblib',
            'XGBoost': 'xgboost.joblib'
        }

        for name, filename in url_model_files.items():
            model_path = self.models_dir / filename
            if model_path.exists():
                model = joblib.load(model_path)
                self.url_models[name] = model

                # Store expected feature names from model
                if hasattr(model, 'feature_names_in_'):
                    self.url_feature_names[name] = list(model.feature_names_in_)
                    logger.info(f"✓ Loaded URL model: {name} ({len(self.url_feature_names[name])} features)")
                elif self.scaler and hasattr(self.scaler, 'feature_names_in_'):
                    # Use scaler's feature names for models without them (like Logistic Regression)
                    self.url_feature_names[name] = list(self.scaler.feature_names_in_)
                    logger.info(f"✓ Loaded URL model: {name} (using scaler features: {len(self.url_feature_names[name])} features)")
                else:
                    logger.info(f"✓ Loaded URL model: {name}")
214
+
215
    def _load_html_models(self):
        """Load HTML prediction models.

        Each loaded entry pairs the fitted model with its saved feature-name
        list ('features' is None when the names file is absent).
        """
        html_model_files = {
            'Random Forest': ('random_forest_html.joblib', 'random_forest_html_feature_names.joblib'),
            'XGBoost': ('xgboost_html.joblib', 'xgboost_html_feature_names.joblib'),
        }

        for name, (model_file, features_file) in html_model_files.items():
            model_path = self.models_dir / model_file
            features_path = self.models_dir / features_file
            if model_path.exists():
                self.html_models[name] = {
                    'model': joblib.load(model_path),
                    'features': joblib.load(features_path) if features_path.exists() else None,
                }
                logger.info(f"✓ Loaded HTML model: {name}")
231
+
232
    def _load_combined_models(self):
        """Load combined URL+HTML prediction models.

        Same layout as the HTML models: a fitted model plus an optional
        feature-name list per entry.
        """
        combined_model_files = {
            'Random Forest Combined': ('random_forest_combined.joblib', 'random_forest_combined_feature_names.joblib'),
            'XGBoost Combined': ('xgboost_combined.joblib', 'xgboost_combined_feature_names.joblib'),
        }

        for name, (model_file, features_file) in combined_model_files.items():
            model_path = self.models_dir / model_file
            features_path = self.models_dir / features_file
            if model_path.exists():
                self.combined_models[name] = {
                    'model': joblib.load(model_path),
                    'features': joblib.load(features_path) if features_path.exists() else None,
                }
                # '?' when the feature-name file was missing
                n = len(self.combined_models[name]['features']) if self.combined_models[name]['features'] else '?'
                logger.info(f"✓ Loaded combined model: {name} ({n} features)")
249
+
250
    def _load_cnn_url_model(self):
        """Load character-level CNN URL model and vocabulary.

        Both the .keras model and the vocab JSON must exist; on any failure
        the attributes stay/become None and a warning is logged.
        """
        model_path = self.models_dir / 'cnn_url_model.keras'
        vocab_path = self.models_dir / 'cnn_url_vocab.json'

        if not model_path.exists():
            logger.warning(f"✗ CNN URL model not found: {model_path}")
            return
        if not vocab_path.exists():
            logger.warning(f"✗ CNN URL vocabulary not found: {vocab_path}")
            return

        try:
            # Imported lazily so the server still starts without TensorFlow.
            import tensorflow as tf
            self.cnn_url_model = tf.keras.models.load_model(str(model_path))

            with open(vocab_path, 'r') as f:
                self.cnn_url_vocab = json.load(f)

            logger.info(f"✓ Loaded CNN URL model (vocab_size={self.cnn_url_vocab['vocab_size']}, max_len={self.cnn_url_vocab['max_len']})")
        except Exception as e:
            logger.warning(f"✗ Failed to load CNN URL model: {e}")
            self.cnn_url_model = None
            self.cnn_url_vocab = None
274
+
275
    def _load_cnn_html_model(self):
        """Load character-level CNN HTML model and vocabulary.

        Mirrors _load_cnn_url_model: missing files or load errors leave the
        attributes as None with a logged warning.
        """
        model_path = self.models_dir / 'cnn_html_model.keras'
        vocab_path = self.models_dir / 'cnn_html_vocab.json'

        if not model_path.exists():
            logger.warning(f"✗ CNN HTML model not found: {model_path}")
            return
        if not vocab_path.exists():
            logger.warning(f"✗ CNN HTML vocabulary not found: {vocab_path}")
            return

        try:
            # Imported lazily so the server still starts without TensorFlow.
            import tensorflow as tf
            self.cnn_html_model = tf.keras.models.load_model(str(model_path))

            with open(vocab_path, 'r') as f:
                self.cnn_html_vocab = json.load(f)

            logger.info(f"✓ Loaded CNN HTML model (vocab_size={self.cnn_html_vocab['vocab_size']}, max_len={self.cnn_html_vocab['max_len']})")
        except Exception as e:
            logger.warning(f"✗ Failed to load CNN HTML model: {e}")
            self.cnn_html_model = None
            self.cnn_html_vocab = None
299
+
300
+ def _encode_for_cnn(self, text: str, vocab: dict) -> np.ndarray:
301
+ """Encode text to a padded integer sequence for a CNN model."""
302
+ char_to_idx = vocab['char_to_idx']
303
+ max_len = vocab['max_len']
304
+ PAD_IDX = 0
305
+ UNK_IDX = 1
306
+ encoded = [char_to_idx.get(c, UNK_IDX) for c in text[:max_len]]
307
+ encoded += [PAD_IDX] * (max_len - len(encoded))
308
+ return np.array([encoded], dtype=np.int32)
309
+
310
+ # ── Shared helpers ─────────────────────────────────────────────
311
+
312
+ @staticmethod
313
+ def _calculate_consensus(predictions: list[dict]) -> tuple[bool, str]:
314
+ """Return (is_phishing, consensus_text) from a list of prediction dicts."""
315
+ total = len(predictions)
316
+ phishing_votes = sum(1 for p in predictions if p['prediction'] == 'PHISHING')
317
+ is_phishing = phishing_votes > total / 2
318
+
319
+ if phishing_votes == total:
320
+ consensus = "ALL MODELS AGREE: PHISHING"
321
+ elif phishing_votes == 0:
322
+ consensus = "ALL MODELS AGREE: LEGITIMATE"
323
+ else:
324
+ consensus = f"MIXED: {phishing_votes}/{total} models say PHISHING"
325
+
326
+ return is_phishing, consensus
327
+
328
+ def _align_features(self, features_df: pd.DataFrame, model_name: str) -> np.ndarray:
329
+ """Align extracted features to a model's expected feature order."""
330
+ expected = self.url_feature_names.get(model_name)
331
+ if expected is None and self.url_feature_names:
332
+ expected = next(iter(self.url_feature_names.values()))
333
+
334
+ if expected is not None:
335
+ aligned = pd.DataFrame(columns=expected)
336
+ for feat in expected:
337
+ aligned[feat] = features_df[feat].values if feat in features_df.columns else 0
338
+ return aligned.values
339
+
340
+ return features_df.values
341
+
342
+ @staticmethod
343
+ def _build_prediction(model_name: str, model, features: np.ndarray, threshold: float = 0.5) -> dict:
344
+ """Run a single model and return a standardised prediction dict."""
345
+ if hasattr(model, 'predict_proba'):
346
+ probabilities = model.predict_proba(features)[0]
347
+ pred = 1 if probabilities[1] > threshold else 0
348
+ confidence = probabilities[pred] * 100
349
+ phishing_prob = probabilities[1] * 100
350
+ legitimate_prob = probabilities[0] * 100
351
+ else:
352
+ pred = model.predict(features)[0]
353
+ confidence = 100.0
354
+ phishing_prob = 100.0 if pred == 1 else 0.0
355
+ legitimate_prob = 0.0 if pred == 1 else 100.0
356
+
357
+ return {
358
+ 'model_name': model_name,
359
+ 'prediction': 'PHISHING' if pred == 1 else 'LEGITIMATE',
360
+ 'confidence': confidence,
361
+ 'phishing_probability': phishing_prob,
362
+ 'legitimate_probability': legitimate_prob,
363
+ }
364
+
365
+ @staticmethod
366
+ def _whitelisted_prediction(model_name: str) -> dict:
367
+ """Return a pre-built LEGITIMATE prediction for whitelisted domains."""
368
+ return {
369
+ 'model_name': model_name,
370
+ 'prediction': 'LEGITIMATE',
371
+ 'confidence': 99.99,
372
+ 'phishing_probability': 0.01,
373
+ 'legitimate_probability': 99.99,
374
+ }
375
+
376
+ # ── URL prediction ────────────────────────────────────────────
377
+
378
+ def predict_url(self, url: str) -> dict:
379
+ """Predict if a URL is phishing using all URL models."""
380
+ parsed = urlparse(url)
381
+ domain = parsed.netloc.lower().replace('www.', '')
382
+ is_whitelisted = any(domain.endswith(d) for d in self.TRUSTED_DOMAINS)
383
+
384
+ # Extract features
385
+ features_dict = self.url_extractor.extract_features(url)
386
+ features_df = pd.DataFrame([features_dict]).drop(columns=['label'], errors='ignore')
387
+
388
+ # Get predictions from each URL model
389
+ predictions = []
390
+ for model_name, model in self.url_models.items():
391
+ if is_whitelisted:
392
+ predictions.append(self._whitelisted_prediction(model_name))
393
+ continue
394
+
395
+ aligned = self._align_features(features_df, model_name)
396
+ if model_name == 'Logistic Regression' and self.scaler:
397
+ aligned = self.scaler.transform(aligned)
398
+
399
+ predictions.append(
400
+ self._build_prediction(model_name, model, aligned, self.DEFAULT_THRESHOLD)
401
+ )
402
+
403
+ is_phishing, consensus = self._calculate_consensus(predictions)
404
+
405
+ return {
406
+ 'url': url,
407
+ 'is_phishing': is_phishing,
408
+ 'consensus': consensus,
409
+ 'predictions': predictions,
410
+ 'features': features_dict,
411
+ }
412
+
413
+ # ── HTML prediction ───────────────────────────────────────────
414
+
415
    def predict_html(self, html_content: str, source: str = "") -> dict:
        """Predict if HTML content is phishing using all HTML models.

        Args:
            html_content: Raw HTML document text.
            source: Label for where the HTML came from (falls back to
                'HTML Content' in the response).

        Returns:
            dict with source, majority verdict, consensus text, per-model
            predictions, and the raw extracted HTML features.
        """
        features = self.html_extractor.extract_features(html_content)
        # NOTE(review): engineer_features presumably adds derived columns on
        # top of the raw extraction — confirm against feature_engineering.py.
        engineered_df = engineer_features(pd.DataFrame([features]))

        predictions = []
        for model_name, model_data in self.html_models.items():
            model = model_data['model']
            feature_names = model_data['features']

            if feature_names:
                # Build the input row in the model's saved feature order,
                # falling back to the raw features dict, then 0.
                feature_list = list(feature_names)
                feature_values = [
                    engineered_df[f].iloc[0] if f in engineered_df.columns else features.get(f, 0)
                    for f in feature_list
                ]
                X = np.array([feature_values])
            else:
                X = engineered_df.values

            predictions.append(self._build_prediction(model_name, model, X))

        is_phishing, consensus = self._calculate_consensus(predictions)

        return {
            'source': source or 'HTML Content',
            'is_phishing': is_phishing,
            'consensus': consensus,
            'predictions': predictions,
            'features': features,
        }
446
+
447
+ # ── Full scan (URL + HTML) ─────────────────────────────────────
448
+
449
+ def predict_from_url(self, url: str) -> dict:
450
+ """Download HTML from URL and analyse both URL and HTML."""
451
+ url_result = self.predict_url(url)
452
+
453
+ try:
454
+ resp = requests.get(url, timeout=10, verify=False, headers=self.HTML_DOWNLOAD_HEADERS)
455
+ html_result = self.predict_html(resp.text, source=url)
456
+
457
+ all_predictions = url_result['predictions'] + html_result['predictions']
458
+ is_phishing, consensus = self._calculate_consensus(all_predictions)
459
+
460
+ return {
461
+ 'url': url,
462
+ 'is_phishing': is_phishing,
463
+ 'url_analysis': url_result,
464
+ 'html_analysis': html_result,
465
+ 'combined_consensus': consensus,
466
+ }
467
+ except Exception as e:
468
+ logger.warning(f"Could not download HTML: {e}")
469
+ return {
470
+ 'url': url,
471
+ 'is_phishing': url_result['is_phishing'],
472
+ 'url_analysis': url_result,
473
+ 'html_analysis': None,
474
+ 'error': str(e),
475
+ }
476
+
477
+
478
+ # ── CNN prediction ─────────────────────────────────────────────
479
+
480
+ def predict_cnn(self, url: str, html_content: str | None = None) -> dict:
481
+ """Predict using both character-level CNN models (URL + HTML)."""
482
+ parsed = urlparse(url)
483
+ domain = parsed.netloc.lower().replace('www.', '')
484
+ is_whitelisted = any(domain.endswith(d) for d in self.TRUSTED_DOMAINS)
485
+
486
+ predictions = []
487
+
488
+ # CNN URL model
489
+ if self.cnn_url_model is not None and self.cnn_url_vocab is not None:
490
+ if is_whitelisted:
491
+ predictions.append(self._whitelisted_prediction('CNN URL (Char-level)'))
492
+ else:
493
+ X = self._encode_for_cnn(url, self.cnn_url_vocab)
494
+ phishing_prob = float(self.cnn_url_model.predict(X, verbose=0)[0][0])
495
+ legitimate_prob = 1.0 - phishing_prob
496
+ is_phishing_pred = phishing_prob >= self.DEFAULT_THRESHOLD
497
+ confidence = (phishing_prob if is_phishing_pred else legitimate_prob) * 100
498
+ predictions.append({
499
+ 'model_name': 'CNN URL (Char-level)',
500
+ 'prediction': 'PHISHING' if is_phishing_pred else 'LEGITIMATE',
501
+ 'confidence': confidence,
502
+ 'phishing_probability': phishing_prob * 100,
503
+ 'legitimate_probability': legitimate_prob * 100,
504
+ })
505
+
506
+ # CNN HTML model
507
+ if self.cnn_html_model is not None and self.cnn_html_vocab is not None and html_content:
508
+ if is_whitelisted:
509
+ predictions.append(self._whitelisted_prediction('CNN HTML (Char-level)'))
510
+ else:
511
+ X = self._encode_for_cnn(html_content, self.cnn_html_vocab)
512
+ phishing_prob = float(self.cnn_html_model.predict(X, verbose=0)[0][0])
513
+ legitimate_prob = 1.0 - phishing_prob
514
+ is_phishing_pred = phishing_prob >= self.DEFAULT_THRESHOLD
515
+ confidence = (phishing_prob if is_phishing_pred else legitimate_prob) * 100
516
+ predictions.append({
517
+ 'model_name': 'CNN HTML (Char-level)',
518
+ 'prediction': 'PHISHING' if is_phishing_pred else 'LEGITIMATE',
519
+ 'confidence': confidence,
520
+ 'phishing_probability': phishing_prob * 100,
521
+ 'legitimate_probability': legitimate_prob * 100,
522
+ })
523
+
524
+ if not predictions:
525
+ raise RuntimeError("No CNN models are loaded")
526
+
527
+ is_phishing, consensus = self._calculate_consensus(predictions)
528
+
529
+ return {
530
+ 'url': url,
531
+ 'is_phishing': is_phishing,
532
+ 'consensus': consensus,
533
+ 'predictions': predictions,
534
+ 'features': {},
535
+ }
536
+
537
+ # ── Combined prediction ────────────────────────────────────────
538
+
539
+ def predict_combined(self, url: str) -> dict:
540
+ """Predict using combined URL+HTML models (single ensemble)."""
541
+ if not self.combined_models:
542
+ raise RuntimeError("No combined models loaded")
543
+
544
+ parsed = urlparse(url)
545
+ domain = parsed.netloc.lower().replace('www.', '')
546
+ is_whitelisted = any(domain.endswith(d) for d in self.TRUSTED_DOMAINS)
547
+
548
+ # Extract URL features
549
+ url_features = self.url_extractor.extract_features(url)
550
+ url_df = pd.DataFrame([url_features]).drop(columns=['label'], errors='ignore')
551
+ url_df = url_df.rename(columns={c: f'url_{c}' for c in url_df.columns})
552
+
553
+ # Download + extract HTML features
554
+ html_features = {}
555
+ html_error = None
556
+ eng_df = pd.DataFrame()
557
+ try:
558
+ resp = requests.get(url, timeout=10, verify=False, headers=self.HTML_DOWNLOAD_HEADERS)
559
+ html_features = self.html_extractor.extract_features(resp.text)
560
+ raw_df = pd.DataFrame([html_features])
561
+ eng_df = engineer_features(raw_df)
562
+ eng_df = eng_df.rename(columns={c: f'html_{c}' for c in eng_df.columns})
563
+ except Exception as e:
564
+ html_error = str(e)
565
+ logger.warning(f"Combined: could not download HTML: {e}")
566
+
567
+ # Combine features
568
+ combined_df = pd.concat([url_df, eng_df], axis=1)
569
+
570
+ # Predict
571
+ predictions = []
572
+ for model_name, model_data in self.combined_models.items():
573
+ if is_whitelisted:
574
+ predictions.append(self._whitelisted_prediction(model_name))
575
+ continue
576
+
577
+ model = model_data['model']
578
+ expected = model_data['features']
579
+
580
+ if expected:
581
+ feature_list = list(expected)
582
+ aligned = pd.DataFrame(columns=feature_list)
583
+ for f in feature_list:
584
+ aligned[f] = combined_df[f].values if f in combined_df.columns else 0
585
+ X = aligned.values
586
+ else:
587
+ X = combined_df.values
588
+
589
+ predictions.append(
590
+ self._build_prediction(model_name, model, X, self.DEFAULT_THRESHOLD)
591
+ )
592
+
593
+ is_phishing, consensus = self._calculate_consensus(predictions)
594
+
595
+ return {
596
+ 'url': url,
597
+ 'is_phishing': is_phishing,
598
+ 'consensus': consensus,
599
+ 'predictions': predictions,
600
+ 'url_features': url_features,
601
+ 'html_features': html_features,
602
+ 'html_error': html_error,
603
+ }
604
+
605
+ # ── Unified all-models prediction ──────────────────────────────
606
+
607
+ def predict_all(self, url: str) -> dict:
608
+ """Run ALL models on a URL and return categorised results."""
609
+ parsed = urlparse(url)
610
+ domain = parsed.netloc.lower().replace('www.', '')
611
+ is_whitelisted = any(domain.endswith(d) for d in self.TRUSTED_DOMAINS)
612
+
613
+ # ── 1. URL feature-based models ───────────────────────────
614
+ url_result = self.predict_url(url)
615
+
616
+ # ── 2. Download HTML (shared across HTML/combined/CNN-HTML) ─
617
+ html_content = None
618
+ html_error = None
619
+ try:
620
+ resp = requests.get(url, timeout=10, verify=False, headers=self.HTML_DOWNLOAD_HEADERS)
621
+ html_content = resp.text
622
+ except Exception as e:
623
+ html_error = str(e)
624
+ logger.warning(f"predict_all: could not download HTML: {e}")
625
+
626
+ # ── 3. HTML feature-based models ─────────────────────────
627
+ html_result = None
628
+ if html_content and self.html_models:
629
+ html_result = self.predict_html(html_content, source=url)
630
+
631
+ # ── 4. Combined URL+HTML feature-based models ────────────
632
+ combined_result = None
633
+ if self.combined_models:
634
+ try:
635
+ combined_result = self._predict_combined_with_html(url, html_content, is_whitelisted)
636
+ except Exception as e:
637
+ logger.warning(f"predict_all: combined prediction failed: {e}")
638
+
639
+ # ── 5. CNN models (URL + HTML) ───────────────────────────
640
+ cnn_result = None
641
+ if self.cnn_url_model is not None or self.cnn_html_model is not None:
642
+ try:
643
+ cnn_result = self.predict_cnn(url, html_content)
644
+ except Exception as e:
645
+ logger.warning(f"predict_all: CNN prediction failed: {e}")
646
+
647
+ # ── Aggregate consensus ──────────────────────────────────
648
+ all_predictions = []
649
+ if url_result:
650
+ all_predictions.extend(url_result.get('predictions', []))
651
+ if html_result:
652
+ all_predictions.extend(html_result.get('predictions', []))
653
+ if combined_result:
654
+ all_predictions.extend(combined_result.get('predictions', []))
655
+ if cnn_result:
656
+ all_predictions.extend(cnn_result.get('predictions', []))
657
+
658
+ is_phishing, consensus = self._calculate_consensus(all_predictions) if all_predictions else (False, "No models available")
659
+
660
+ return {
661
+ 'url': url,
662
+ 'is_phishing': is_phishing,
663
+ 'overall_consensus': consensus,
664
+ 'url_models': url_result,
665
+ 'html_models': html_result,
666
+ 'combined_models': combined_result,
667
+ 'cnn_models': cnn_result,
668
+ 'html_error': html_error,
669
+ }
670
+
671
+ def _predict_combined_with_html(self, url: str, html_content: str | None, is_whitelisted: bool) -> dict:
672
+ """Predict using combined models, optionally with pre-fetched HTML."""
673
+ # Extract URL features
674
+ url_features = self.url_extractor.extract_features(url)
675
+ url_df = pd.DataFrame([url_features]).drop(columns=['label'], errors='ignore')
676
+ url_df = url_df.rename(columns={c: f'url_{c}' for c in url_df.columns})
677
+
678
+ # HTML features
679
+ html_features = {}
680
+ html_error = None
681
+ eng_df = pd.DataFrame()
682
+ if html_content:
683
+ try:
684
+ html_features = self.html_extractor.extract_features(html_content)
685
+ raw_df = pd.DataFrame([html_features])
686
+ eng_df = engineer_features(raw_df)
687
+ eng_df = eng_df.rename(columns={c: f'html_{c}' for c in eng_df.columns})
688
+ except Exception as e:
689
+ html_error = str(e)
690
+
691
+ # Combine
692
+ combined_df = pd.concat([url_df, eng_df], axis=1)
693
+
694
+ # Predict
695
+ predictions = []
696
+ for model_name, model_data in self.combined_models.items():
697
+ if is_whitelisted:
698
+ predictions.append(self._whitelisted_prediction(model_name))
699
+ continue
700
+
701
+ model = model_data['model']
702
+ expected = model_data['features']
703
+
704
+ if expected:
705
+ feature_list = list(expected)
706
+ aligned = pd.DataFrame(columns=feature_list)
707
+ for f in feature_list:
708
+ aligned[f] = combined_df[f].values if f in combined_df.columns else 0
709
+ X = aligned.values
710
+ else:
711
+ X = combined_df.values
712
+
713
+ predictions.append(
714
+ self._build_prediction(model_name, model, X, self.DEFAULT_THRESHOLD)
715
+ )
716
+
717
+ is_phishing, consensus_text = self._calculate_consensus(predictions)
718
+
719
+ return {
720
+ 'url': url,
721
+ 'is_phishing': is_phishing,
722
+ 'consensus': consensus_text,
723
+ 'predictions': predictions,
724
+ 'url_features': url_features,
725
+ 'html_features': html_features,
726
+ 'html_error': html_error,
727
+ }
728
+
729
+
730
# Initialize service (singleton) — created once at import time so every
# request handler shares the same loaded models.
detector = PhishingDetectorService()
732
+
733
+
734
# ── Helpers ───────────────────────────────────────────────────────

def _serve_static_html(filename: str, cache: bool = False) -> HTMLResponse:
    """Return an HTMLResponse for a file inside static/, or 404."""
    static_file = Path(__file__).parent / 'static' / filename
    if not static_file.exists():
        return HTMLResponse(content="<h1>Page not found</h1>", status_code=404)
    # Optionally mark the page as cacheable for one day.
    if cache:
        headers = {"Cache-Control": "public, max-age=86400"}
    else:
        headers = None
    return HTMLResponse(content=static_file.read_text(encoding='utf-8'), headers=headers)
743
+
744
+
745
# ── API Endpoints ─────────────────────────────────────────────────

@app.get("/", response_class=HTMLResponse)
async def root():
    """Serve the main web interface (static/index.html)."""
    return _serve_static_html('index.html')
751
+
752
+
753
@app.get("/models", response_class=HTMLResponse)
async def models_page():
    """Serve the model details page (static/models.html, cacheable)."""
    return _serve_static_html('models.html', cache=True)
757
+
758
+
759
async def _safe_predict(label: str, fn, *args) -> JSONResponse:
    """Run a prediction function with uniform error handling.

    HTTPExceptions raised inside *fn* propagate unchanged (preserving
    their status code); any other failure is logged under *label* and
    surfaced as a 500 with the exception message as detail.
    """
    try:
        return JSONResponse(content=convert_to_json_serializable(fn(*args)))
    except HTTPException:
        # Don't re-wrap deliberate HTTP errors as generic 500s.
        raise
    except Exception as e:
        logger.error(f"Error in {label}: {e}")
        # Chain the cause so tracebacks show the original failure.
        raise HTTPException(status_code=500, detail=str(e)) from e
766
+
767
+
768
@app.post("/api/predict/url", response_model=URLPredictionResponse)
async def predict_url(request: URLRequest):
    """Predict if URL is phishing (URL feature models only).

    Thin wrapper: delegates to the shared detector singleton via
    _safe_predict, which serialises the result and maps failures to 500.
    """
    return await _safe_predict("predict_url", detector.predict_url, request.url)
772
+
773
+
774
@app.post("/api/predict/html")
async def predict_html(request: HTMLRequest):
    """Predict if HTML content is phishing (HTML feature models only).

    request.url is optional and used purely as a display label.
    """
    return await _safe_predict("predict_html", detector.predict_html, request.html_content, request.url or "")
778
+
779
+
780
@app.post("/api/predict/full")
async def predict_full(request: URLRequest):
    """Analyse URL and download HTML for complete analysis (URL + HTML models)."""
    return await _safe_predict("predict_full", detector.predict_from_url, request.url)
784
+
785
+
786
@app.post("/api/predict/combined")
async def predict_combined(request: URLRequest):
    """Predict using combined URL+HTML model (single ensemble over joint features)."""
    return await _safe_predict("predict_combined", detector.predict_combined, request.url)
790
+
791
+
792
@app.post("/api/predict/cnn")
async def predict_cnn(request: URLRequest):
    """Predict using character-level CNN models.

    No HTML is passed here, so only the CNN URL model runs.
    """
    return await _safe_predict("predict_cnn", detector.predict_cnn, request.url, None)
796
+
797
+
798
@app.post("/api/predict/all")
async def predict_all(request: URLRequest):
    """Run ALL models on a URL — unified endpoint (URL, HTML, combined, CNN)."""
    return await _safe_predict("predict_all", detector.predict_all, request.url)
802
+
803
+
804
@app.get("/api/health")
async def health():
    """Health check: report which model groups are currently loaded."""
    report = {"status": "healthy"}
    report["url_models"] = len(detector.url_models)
    report["html_models"] = len(detector.html_models)
    report["combined_models"] = len(detector.combined_models)
    # The CNN models are single objects, so report presence, not count.
    report["cnn_url_model"] = detector.cnn_url_model is not None
    report["cnn_html_model"] = detector.cnn_html_model is not None
    return report
815
+
816
+
817
if __name__ == "__main__":
    import uvicorn
    # Bind on all interfaces; 7860 is presumably chosen for Hugging Face
    # Spaces (its conventional app port) — confirm before changing.
    uvicorn.run(app, host="0.0.0.0", port=7860)
server/static/index.html ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>PHISHING DETECTION</title>
    <link rel="stylesheet" href="/static/style.css">
    <!-- Prefetch the model-details page so the "Learn More" link loads instantly -->
    <link rel="prefetch" href="/models">
</head>
<body>
    <div class="container">
        <header>
            <div class="logo">Phishing Detection System</div>
            <div class="tagline">Multi-Model URL + HTML Analysis</div>
        </header>

        <!-- URL input form; analyzeAll()/clearResults() live in script.js -->
        <section class="input-section">
            <label class="input-label" for="urlInput">Enter URL to analyze</label>
            <div class="input-wrapper">
                <input
                    type="text"
                    id="urlInput"
                    placeholder="https://example.com"
                    value="https://github.com"
                />
                <button class="btn" onclick="analyzeAll()">Analyze</button>
            </div>
            <div class="btn-group">
                <button class="btn btn-secondary" onclick="clearResults()">Clear</button>
            </div>
        </section>

        <!-- Progress indicator shown while the /api/predict/all request runs -->
        <div class="loading" id="loading">
            <div class="loading-bar"></div>
            <div class="loading-text">Analyzing with all models</div>
        </div>

        <div class="results" id="results">
            <!-- Results injected here -->
        </div>

        <footer>
            <div class="footer-text">Machine Learning Phishing Detection</div>
            <a href="/models" class="learn-more-btn">Learn More</a>
        </footer>
    </div>

    <!-- ?v=4 is a cache-busting query string; bump it when script.js changes -->
    <script src="/static/script.js?v=4"></script>
</body>
</html>
server/static/models.html ADDED
@@ -0,0 +1,1130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Model Details — Phishing Detection</title>
7
+ <link rel="stylesheet" href="/static/style.css">
8
+ </head>
9
+ <body class="models-page">
10
+ <div class="container">
11
+ <header>
12
+ <div class="header-left">
13
+ <div class="logo"><a href="/">Phishing Detection System</a></div>
14
+ <div class="tagline">Model Performance Details</div>
15
+ </div>
16
+ <a href="/" class="back-link">&larr; Back</a>
17
+ </header>
18
+
19
+ <section class="page-title-section">
20
+ <h1 class="page-title">Model Details</h1>
21
+ <p class="page-description">
22
+ Performance metrics, feature importance, and configuration details for all 9 machine learning models
23
+ used in the phishing detection pipeline. Models span URL features (125), RAW HTML features (77) + engineered (23),
24
+ combined features (225), and character-level CNN approaches.
25
+ </p>
26
+ </section>
27
+
28
+ <!-- DETECTION PIPELINE -->
29
+ <section class="section">
30
+ <div class="section-title">Detection Pipeline</div>
31
+ <div class="pipeline">
32
+ <div class="pipeline-step"><span class="step-number">1</span>URL Input</div>
33
+ <div class="pipeline-step"><span class="step-number">2</span>Feature Extraction</div>
34
+ <div class="pipeline-step"><span class="step-number">3</span>3 URL Models</div>
35
+ <div class="pipeline-step"><span class="step-number">4</span>HTML Download</div>
36
+ <div class="pipeline-step"><span class="step-number">5</span>2 HTML + 2 Combined</div>
37
+ <div class="pipeline-step"><span class="step-number">6</span>2 CNN Models</div>
38
+ <div class="pipeline-step"><span class="step-number">7</span>9-Model Consensus</div>
39
+ </div>
40
+ </section>
41
+
42
+ <!-- URL FEATURES -->
43
+ <section class="section">
44
+ <div class="section-title collapsible-toggle" onclick="toggleFeatures(this)">
45
+ URL Features <span class="feature-count">125 features</span>
46
+ <span class="toggle-icon">+</span>
47
+ </div>
48
+ <div class="collapsible-content">
49
+ <div class="section-subtitle">All features extracted from the URL string. Hover over any feature to see its description.</div>
50
+
51
+ <div class="feature-category-label">Length &amp; Structure</div>
52
+ <div class="feature-grid">
53
+ <div class="feature-chip" data-tip="Total character count of the full URL">url_length</div>
54
+ <div class="feature-chip" data-tip="Character count of the domain name only">domain_length</div>
55
+ <div class="feature-chip" data-tip="Character count of the URL path component">path_length</div>
56
+ <div class="feature-chip" data-tip="Character count of the query string">query_length</div>
57
+ <div class="feature-chip" data-tip="URL length bucket: 0=short (<40), 1=medium, 2=long, 3=very long (>120)">url_length_category</div>
58
+ <div class="feature-chip" data-tip="Domain length bucket: 0=short (<10), 1=medium, 2=long, 3=very long (>30)">domain_length_category</div>
59
+ </div>
60
+
61
+ <div class="feature-category-label">Character Counts</div>
62
+ <div class="feature-grid">
63
+ <div class="feature-chip" data-tip="Number of dots (.) in the full URL">num_dots</div>
64
+ <div class="feature-chip" data-tip="Number of hyphens (-) in the full URL">num_hyphens</div>
65
+ <div class="feature-chip" data-tip="Number of underscores (_) in the full URL">num_underscores</div>
66
+ <div class="feature-chip" data-tip="Number of forward slashes (/) in the URL">num_slashes</div>
67
+ <div class="feature-chip" data-tip="Number of question marks (?) in the URL">num_question_marks</div>
68
+ <div class="feature-chip" data-tip="Number of ampersands (&amp;) in the URL">num_ampersands</div>
69
+ <div class="feature-chip" data-tip="Number of equals signs (=) in the URL">num_equals</div>
70
+ <div class="feature-chip" data-tip="Number of @ symbols — often used to obscure the real destination">num_at</div>
71
+ <div class="feature-chip" data-tip="Number of percent (%) characters indicating URL encoding">num_percent</div>
72
+ <div class="feature-chip" data-tip="Total digit characters in the full URL">num_digits_url</div>
73
+ <div class="feature-chip" data-tip="Total letter characters in the full URL">num_letters_url</div>
74
+ <div class="feature-chip" data-tip="Number of dots (.) in the domain only">domain_dots</div>
75
+ <div class="feature-chip" data-tip="Number of hyphens (-) in the domain only">domain_hyphens</div>
76
+ <div class="feature-chip" data-tip="Number of digit characters in the domain">domain_digits</div>
77
+ <div class="feature-chip" data-tip="Number of slashes (/) in the path component">path_slashes</div>
78
+ <div class="feature-chip" data-tip="Number of dots (.) in the path component">path_dots</div>
79
+ <div class="feature-chip" data-tip="Number of digit characters in the path">path_digits</div>
80
+ </div>
81
+
82
+ <div class="feature-category-label">Character Ratios</div>
83
+ <div class="feature-grid">
84
+ <div class="feature-chip" data-tip="Proportion of digit characters to total URL length">digit_ratio_url</div>
85
+ <div class="feature-chip" data-tip="Proportion of letter characters to total URL length">letter_ratio_url</div>
86
+ <div class="feature-chip" data-tip="Proportion of special (non-alphanumeric) characters in URL">special_char_ratio</div>
87
+ <div class="feature-chip" data-tip="Proportion of digits in the domain name">digit_ratio_domain</div>
88
+ <div class="feature-chip" data-tip="Proportion of symbols (hyphens, underscores, dots) in domain">symbol_ratio_domain</div>
89
+ </div>
90
+
91
+ <div class="feature-category-label">Domain Structure</div>
92
+ <div class="feature-grid">
93
+ <div class="feature-chip" data-tip="Number of subdomains (e.g. sub.example.com = 1)">num_subdomains</div>
94
+ <div class="feature-chip" data-tip="Total number of dot-separated domain parts">num_domain_parts</div>
95
+ <div class="feature-chip" data-tip="Character length of the top-level domain (e.g. com=3)">tld_length</div>
96
+ <div class="feature-chip" data-tip="Character length of the second-level domain">sld_length</div>
97
+ <div class="feature-chip" data-tip="Length of the longest dot-separated domain part">longest_domain_part</div>
98
+ <div class="feature-chip" data-tip="Average length of all domain parts">avg_domain_part_len</div>
99
+ <div class="feature-chip" data-tip="1 if any domain part exceeds 20 characters">longest_part_gt_20</div>
100
+ <div class="feature-chip" data-tip="1 if any domain part exceeds 30 characters">longest_part_gt_30</div>
101
+ <div class="feature-chip" data-tip="1 if any domain part exceeds 40 characters">longest_part_gt_40</div>
102
+ <div class="feature-chip" data-tip="1 if TLD is suspicious (.tk, .ml, .xyz, .top, .zip, etc.)">has_suspicious_tld</div>
103
+ <div class="feature-chip" data-tip="1 if TLD is well-known and trusted (.com, .org, .edu, etc.)">has_trusted_tld</div>
104
+ <div class="feature-chip" data-tip="1 if URL contains a port number">has_port</div>
105
+ <div class="feature-chip" data-tip="1 if URL uses a non-standard port (not 80 or 443)">has_non_std_port</div>
106
+ <div class="feature-chip" data-tip="Composite randomness score of the domain (0-1)">domain_randomness_score</div>
107
+ <div class="feature-chip" data-tip="Consonant clustering score of the SLD — random strings have high clusters">sld_consonant_cluster_score</div>
108
+ <div class="feature-chip" data-tip="1 if SLD contains keyboard walk patterns (qwerty, asdfgh)">sld_keyboard_pattern</div>
109
+ <div class="feature-chip" data-tip="1 if SLD contains a common English word (4+ characters)">sld_has_dictionary_word</div>
110
+ <div class="feature-chip" data-tip="Score based on vowel/consonant alternation — real words are more pronounceable">sld_pronounceability_score</div>
111
+ <div class="feature-chip" data-tip="1 if digits appear at suspicious positions in the SLD (start or end)">domain_digit_position_suspicious</div>
112
+ </div>
113
+
114
+ <div class="feature-category-label">Path Analysis</div>
115
+ <div class="feature-grid">
116
+ <div class="feature-chip" data-tip="Directory depth of the URL path (number of segments)">path_depth</div>
117
+ <div class="feature-chip" data-tip="Length of the longest path segment between slashes">max_path_segment_len</div>
118
+ <div class="feature-chip" data-tip="Average length of path segments">avg_path_segment_len</div>
119
+ <div class="feature-chip" data-tip="1 if the URL path has a file extension">has_extension</div>
120
+ <div class="feature-chip" data-tip="Category of file extension: 0=none, 1=document, 2=media, 3=executable, 4=web, 5=other">extension_category</div>
121
+ <div class="feature-chip" data-tip="1 if extension is suspicious (.exe, .bat, .cmd, .scr, .vbs, .ps1)">has_suspicious_extension</div>
122
+ <div class="feature-chip" data-tip="1 if extension is specifically .exe">has_exe</div>
123
+ <div class="feature-chip" data-tip="1 if path contains double slash (//) — possible redirect trick">has_double_slash</div>
124
+ <div class="feature-chip" data-tip="1 if a brand name appears in path but not in domain — impersonation signal">path_has_brand_not_domain</div>
125
+ <div class="feature-chip" data-tip="1 if path contains an IP address pattern">path_has_ip_pattern</div>
126
+ <div class="feature-chip" data-tip="1 if a document extension + 'download' keyword appear together">suspicious_path_extension_combo</div>
127
+ </div>
128
+
129
+ <div class="feature-category-label">Query String</div>
130
+ <div class="feature-grid">
131
+ <div class="feature-chip" data-tip="Number of query parameters">num_params</div>
132
+ <div class="feature-chip" data-tip="1 if URL has a query string">has_query</div>
133
+ <div class="feature-chip" data-tip="Total character length of all query parameter values">query_value_length</div>
134
+ <div class="feature-chip" data-tip="Length of the longest query parameter">max_param_len</div>
135
+ <div class="feature-chip" data-tip="1 if any query parameter value looks like a URL — possible redirect">query_has_url</div>
136
+ </div>
137
+
138
+ <div class="feature-category-label">Statistical &amp; Entropy</div>
139
+ <div class="feature-grid">
140
+ <div class="feature-chip" data-tip="Shannon entropy of the full URL — random/phishing URLs have higher entropy">url_entropy</div>
141
+ <div class="feature-chip" data-tip="Shannon entropy of the domain name">domain_entropy</div>
142
+ <div class="feature-chip" data-tip="Shannon entropy of the path component">path_entropy</div>
143
+ <div class="feature-chip" data-tip="Longest run of consecutive digit characters">max_consecutive_digits</div>
144
+ <div class="feature-chip" data-tip="Longest run of consecutive letter characters">max_consecutive_chars</div>
145
+ <div class="feature-chip" data-tip="Longest run of consecutive consonants in domain">max_consecutive_consonants</div>
146
+ <div class="feature-chip" data-tip="Rate of adjacent character repetitions (aa, bb, etc.)">char_repeat_rate</div>
147
+ <div class="feature-chip" data-tip="Ratio of unique bigrams — lower = more repetitive URL">unique_bigram_ratio</div>
148
+ <div class="feature-chip" data-tip="Ratio of unique trigrams — lower = more repetitive URL">unique_trigram_ratio</div>
149
+ <div class="feature-chip" data-tip="Ratio of unique characters to total characters in the SLD (0–1)">sld_letter_diversity</div>
150
+ <div class="feature-chip" data-tip="1 if domain contains both digits and letters">domain_has_numbers_letters</div>
151
+ <div class="feature-chip" data-tip="Composite score (0–1) combining URL length, dots, hyphens, slashes, and entropy">url_complexity_score</div>
152
+ </div>
153
+
154
+ <div class="feature-category-label">Security Indicators</div>
155
+ <div class="feature-grid">
156
+ <div class="feature-chip" data-tip="1 if domain is an IP address instead of a hostname">has_ip_address</div>
157
+ <div class="feature-chip" data-tip="1 if URL contains @ symbol — can trick browsers into treating text before @ as user info">has_at_symbol</div>
158
+ <div class="feature-chip" data-tip="1 if double-slash redirect pattern found in path">has_redirect</div>
159
+ <div class="feature-chip" data-tip="1 if domain is a known URL shortener (bit.ly, t.co, etc.)">is_shortened</div>
160
+ <div class="feature-chip" data-tip="1 if hosted on a free hosting service (000webhostapp, freehosting, etc.)">is_free_hosting</div>
161
+ <div class="feature-chip" data-tip="1 if hosted on a free platform (github.io, vercel.app, netlify.app, etc.)">is_free_platform</div>
162
+ <div class="feature-chip" data-tip="Length of subdomain on free platforms — long random subdomains are suspicious">platform_subdomain_length</div>
163
+ <div class="feature-chip" data-tip="1 if subdomain matches UUID-like pattern (common on Replit, Firebase)">has_uuid_subdomain</div>
164
+ <div class="feature-chip" data-tip="1 if URL uses HTTP instead of HTTPS">is_http</div>
165
+ </div>
166
+
167
+ <div class="feature-category-label">Keywords &amp; Brand Detection</div>
168
+ <div class="feature-grid">
169
+ <div class="feature-chip" data-tip="Count of phishing keywords (login, verify, secure, etc.) in URL">num_phishing_keywords</div>
170
+ <div class="feature-chip" data-tip="1 if phishing keywords found in the domain">phishing_in_domain</div>
171
+ <div class="feature-chip" data-tip="1 if phishing keywords found in the path">phishing_in_path</div>
172
+ <div class="feature-chip" data-tip="Number of recognized brand names in the URL">num_brands</div>
173
+ <div class="feature-chip" data-tip="1 if a brand name appears in the domain">brand_in_domain</div>
174
+ <div class="feature-chip" data-tip="1 if a brand name appears in the path">brand_in_path</div>
175
+ <div class="feature-chip" data-tip="Score for brand impersonation — brand in URL but not the real domain">brand_impersonation</div>
176
+ <div class="feature-chip" data-tip="1 if 'login' appears in URL">has_login</div>
177
+ <div class="feature-chip" data-tip="1 if 'account' appears in URL">has_account</div>
178
+ <div class="feature-chip" data-tip="1 if 'verify' appears in URL">has_verify</div>
179
+ <div class="feature-chip" data-tip="1 if 'secure' appears in URL">has_secure</div>
180
+ <div class="feature-chip" data-tip="1 if 'update' appears in URL">has_update</div>
181
+ <div class="feature-chip" data-tip="1 if 'bank' appears in URL">has_bank</div>
182
+ <div class="feature-chip" data-tip="1 if 'password' or 'passwd' appears in URL">has_password</div>
183
+ <div class="feature-chip" data-tip="1 if 'suspend' appears in URL">has_suspend</div>
184
+ <div class="feature-chip" data-tip="1 if 'webscr' appears in URL — common in PayPal phishing">has_webscr</div>
185
+ <div class="feature-chip" data-tip="1 if 'cmd=' or '/cmd/' appears in URL">has_cmd</div>
186
+ <div class="feature-chip" data-tip="1 if 'cgi-bin' or '.cgi' appears in URL">has_cgi</div>
187
+ <div class="feature-chip" data-tip="1 if brand name in subdomain but not main domain — spoofing pattern">brand_in_subdomain_not_domain</div>
188
+ <div class="feature-chip" data-tip="1 if multiple different brand names detected in URL">multiple_brands_in_url</div>
189
+ <div class="feature-chip" data-tip="1 if brand name combined with hyphen (e.g. paypal-login.com)">brand_with_hyphen</div>
190
+ <div class="feature-chip" data-tip="1 if brand found in domain with suspicious TLD">suspicious_brand_tld</div>
191
+ <div class="feature-chip" data-tip="1 if brand name + phishing keyword both present">brand_keyword_combo</div>
192
+ </div>
193
+
194
+ <div class="feature-category-label">Encoding &amp; Obfuscation</div>
195
+ <div class="feature-grid">
196
+ <div class="feature-chip" data-tip="1 if URL contains percent-encoded characters">has_url_encoding</div>
197
+ <div class="feature-chip" data-tip="Number of percent-encoded sequences in URL">encoding_count</div>
198
+ <div class="feature-chip" data-tip="Difference in length between encoded and decoded URL">encoding_diff</div>
199
+ <div class="feature-chip" data-tip="1 if domain contains Punycode (xn-- prefix) — internationalized domain">has_punycode</div>
200
+ <div class="feature-chip" data-tip="1 if URL contains non-ASCII Unicode characters">has_unicode</div>
201
+ <div class="feature-chip" data-tip="1 if URL contains hexadecimal string (0x...)">has_hex_string</div>
202
+ <div class="feature-chip" data-tip="1 if URL contains a Base64-like string (20+ alphanumeric chars with +/=)">has_base64</div>
203
+ <div class="feature-chip" data-tip="1 if domain contains look-alike patterns (rn, vv, cl, 0, 1) that mimic other characters">has_lookalike_chars</div>
204
+ <div class="feature-chip" data-tip="Score for mixed Unicode scripts in domain — homograph attack indicator">mixed_script_score</div>
205
+ <div class="feature-chip" data-tip="Risk score for homograph attacks targeting brand names">homograph_brand_risk</div>
206
+ <div class="feature-chip" data-tip="1 if IDN homograph score exceeds 0.5 threshold">suspected_idn_homograph</div>
207
+ <div class="feature-chip" data-tip="1 if URL contains double percent-encoding (%% or %25)">double_encoding</div>
208
+ <div class="feature-chip" data-tip="1 if percent-encoding found specifically in the domain">encoding_in_domain</div>
209
+ <div class="feature-chip" data-tip="Count of suspicious Unicode characters (RTL override, zero-width, BOM), capped at 5">suspicious_unicode_category</div>
210
+ </div>
211
+ </div>
212
+ </section>
213
+
214
+ <!-- HTML FEATURES -->
215
+ <section class="section">
216
+ <div class="section-title collapsible-toggle" onclick="toggleFeatures(this)">
217
+ RAW HTML Features <span class="feature-count">77 raw features + 23 engineered</span>
218
+ <span class="toggle-icon">+</span>
219
+ </div>
220
+ <div class="collapsible-content">
221
+ <div class="section-subtitle">All features extracted from HTML source and DOM structure. Hover over any feature to see its description.</div>
222
+
223
+ <div class="feature-category-label">Document Size &amp; Text</div>
224
+ <div class="feature-grid">
225
+ <div class="feature-chip" data-tip="Maximum nesting depth of DOM elements">dom_depth</div>
226
+ <div class="feature-chip" data-tip="Total character length of the raw HTML">html_length</div>
227
+ <div class="feature-chip" data-tip="Total length of visible extracted text">text_length</div>
228
+ <div class="feature-chip" data-tip="Number of words extracted from page text">num_words</div>
229
+ <div class="feature-chip" data-tip="Ratio of text content length to full HTML length">text_to_html_ratio</div>
230
+ <div class="feature-chip" data-tip="Total character length of inline CSS styles">inline_css_length</div>
231
+ <div class="feature-chip" data-tip="Total number of HTML tags in the document">num_tags</div>
232
+ </div>
233
+
234
+ <div class="feature-category-label">Metadata &amp; Page Identity</div>
235
+ <div class="feature-grid">
236
+ <div class="feature-chip" data-tip="1 if HTML title tag is present">has_title</div>
237
+ <div class="feature-chip" data-tip="1 if meta description tag is present">has_description</div>
238
+ <div class="feature-chip" data-tip="1 if meta keywords tag is present">has_keywords</div>
239
+ <div class="feature-chip" data-tip="1 if author metadata is present">has_author</div>
240
+ <div class="feature-chip" data-tip="1 if copyright text or metadata is detected">has_copyright</div>
241
+ <div class="feature-chip" data-tip="1 if viewport meta tag is present">has_viewport</div>
242
+ <div class="feature-chip" data-tip="1 if favicon link is declared">has_favicon</div>
243
+ <div class="feature-chip" data-tip="Number of meta tags in the page head">num_meta_tags</div>
244
+ </div>
245
+
246
+ <div class="feature-category-label">DOM Elements &amp; Layout</div>
247
+ <div class="feature-grid">
248
+ <div class="feature-chip" data-tip="Number of div elements">num_divs</div>
249
+ <div class="feature-chip" data-tip="Number of span elements">num_spans</div>
250
+ <div class="feature-chip" data-tip="Number of paragraph tags">num_paragraphs</div>
251
+ <div class="feature-chip" data-tip="Number of heading tags (h1-h6)">num_headings</div>
252
+ <div class="feature-chip" data-tip="Number of list containers (ul, ol)">num_lists</div>
253
+ <div class="feature-chip" data-tip="Number of table elements">num_tables</div>
254
+ <div class="feature-chip" data-tip="Number of image elements">num_images</div>
255
+ <div class="feature-chip" data-tip="Number of iframe elements">num_iframes</div>
256
+ <div class="feature-chip" data-tip="Number of hidden iframes">num_hidden_iframes</div>
257
+ <div class="feature-chip" data-tip="Number of images embedded as data URIs">num_data_uri_images</div>
258
+ <div class="feature-chip" data-tip="Number of linked CSS files">num_css_files</div>
259
+ <div class="feature-chip" data-tip="Number of script tags">num_scripts</div>
260
+ <div class="feature-chip" data-tip="Number of inline script blocks">num_inline_scripts</div>
261
+ <div class="feature-chip" data-tip="Number of inline style blocks or style attributes">num_inline_styles</div>
262
+ <div class="feature-chip" data-tip="Number of input fields in forms">num_input_fields</div>
263
+ </div>
264
+
265
+ <div class="feature-category-label">Link &amp; Resource Analysis</div>
266
+ <div class="feature-grid">
267
+ <div class="feature-chip" data-tip="Total number of links">num_links</div>
268
+ <div class="feature-chip" data-tip="Number of internal links pointing to same domain">num_internal_links</div>
269
+ <div class="feature-chip" data-tip="Number of external links pointing to other domains">num_external_links</div>
270
+ <div class="feature-chip" data-tip="Ratio of external links to all links">ratio_external_links</div>
271
+ <div class="feature-chip" data-tip="Count of distinct external domains referenced">num_unique_external_domains</div>
272
+ <div class="feature-chip" data-tip="Number of mailto links">num_mailto_links</div>
273
+ <div class="feature-chip" data-tip="Number of javascript: pseudo-links">num_javascript_links</div>
274
+ <div class="feature-chip" data-tip="Number of links pointing to IP-based hosts">num_ip_based_links</div>
275
+ <div class="feature-chip" data-tip="Number of links using suspicious top-level domains">num_suspicious_tld_links</div>
276
+ <div class="feature-chip" data-tip="Number of links with empty or placeholder href values">num_empty_links</div>
277
+ <div class="feature-chip" data-tip="Anchor text points to content unrelated to destination">num_anchor_text_mismatch</div>
278
+ <div class="feature-chip" data-tip="Number of external stylesheet references">num_external_css</div>
279
+ <div class="feature-chip" data-tip="Number of externally loaded images">num_external_images</div>
280
+ <div class="feature-chip" data-tip="Number of externally loaded JavaScript files">num_external_scripts</div>
281
+ </div>
282
+
283
+ <div class="feature-category-label">Forms &amp; Inputs</div>
284
+ <div class="feature-grid">
285
+ <div class="feature-chip" data-tip="1 if at least one form tag exists">has_form</div>
286
+ <div class="feature-chip" data-tip="1 if a login-style form is detected">has_login_form</div>
287
+ <div class="feature-chip" data-tip="Number of form elements">num_forms</div>
288
+ <div class="feature-chip" data-tip="Number of email-type fields">num_email_fields</div>
289
+ <div class="feature-chip" data-tip="Number of password fields">num_password_fields</div>
290
+ <div class="feature-chip" data-tip="Number of text input fields">num_text_fields</div>
291
+ <div class="feature-chip" data-tip="Number of submit buttons">num_submit_buttons</div>
292
+ <div class="feature-chip" data-tip="Number of hidden input fields">num_hidden_fields</div>
293
+ <div class="feature-chip" data-tip="Number of forms missing associated labels">num_forms_without_labels</div>
294
+ <div class="feature-chip" data-tip="Number of forms with empty action attribute">num_empty_form_actions</div>
295
+ <div class="feature-chip" data-tip="Number of forms submitting to external domains">num_external_form_actions</div>
296
+ <div class="feature-chip" data-tip="1 if password form submits to external domain">password_with_external_action</div>
297
+ </div>
298
+
299
+ <div class="feature-category-label">Scripts &amp; Dynamic Behavior</div>
300
+ <div class="feature-grid">
301
+ <div class="feature-chip" data-tip="1 if JavaScript eval() is used">has_eval</div>
302
+ <div class="feature-chip" data-tip="1 if JavaScript escape() is used">has_escape</div>
303
+ <div class="feature-chip" data-tip="1 if JavaScript unescape() is used">has_unescape</div>
304
+ <div class="feature-chip" data-tip="1 if atob() decoding function is present">has_atob</div>
305
+ <div class="feature-chip" data-tip="1 if Base64-like content or decoding usage is detected">has_base64</div>
306
+ <div class="feature-chip" data-tip="1 if String.fromCharCode usage is present">has_fromcharcode</div>
307
+ <div class="feature-chip" data-tip="1 if document.write() is used">has_document_write</div>
308
+ <div class="feature-chip" data-tip="1 if window.open() is used">has_window_open</div>
309
+ <div class="feature-chip" data-tip="1 if location.replace() redirects are used">has_location_replace</div>
310
+ <div class="feature-chip" data-tip="1 if meta refresh redirect is present">has_meta_refresh</div>
311
+ <div class="feature-chip" data-tip="Number of onclick event handlers">num_onclick_events</div>
312
+ <div class="feature-chip" data-tip="Number of onload event handlers">num_onload_events</div>
313
+ <div class="feature-chip" data-tip="Number of onerror event handlers">num_onerror_events</div>
314
+ </div>
315
+
316
+ <div class="feature-category-label">Visibility &amp; Interaction Tricks</div>
317
+ <div class="feature-grid">
318
+ <div class="feature-chip" data-tip="1 if display:none usage is detected">has_display_none</div>
319
+ <div class="feature-chip" data-tip="1 if visibility:hidden usage is detected">has_visibility_hidden</div>
320
+ <div class="feature-chip" data-tip="1 if right-click is disabled by script">has_right_click_disabled</div>
321
+ <div class="feature-chip" data-tip="1 if status bar text customization is attempted">has_status_bar_customization</div>
322
+ </div>
323
+
324
+ <div class="feature-category-label">Contact &amp; Social Engineering Signals</div>
325
+ <div class="feature-grid">
326
+ <div class="feature-chip" data-tip="1 if raw email address patterns appear in HTML">has_email_address</div>
327
+ <div class="feature-chip" data-tip="1 if phone number patterns appear in HTML">has_phone_number</div>
328
+ <div class="feature-chip" data-tip="Number of known brand name mentions in HTML text">num_brand_mentions</div>
329
+ <div class="feature-chip" data-tip="Number of urgency words (urgent, verify, immediately, etc.)">num_urgency_keywords</div>
330
+ </div>
331
+
332
+ <div class="feature-category-label">Ratios &amp; Proportions</div>
333
+ <div class="feature-grid">
334
+ <div class="feature-chip" data-tip="Number of forms divided by number of input fields — phishing sites often have few forms with many inputs">forms_to_inputs_ratio</div>
335
+ <div class="feature-chip" data-tip="Proportion of external links to total links — high ratio indicates off-site redirection">external_to_total_links</div>
336
+ <div class="feature-chip" data-tip="Number of scripts divided by total HTML tags — high ratio suggests heavy JavaScript reliance">scripts_to_tags_ratio</div>
337
+ <div class="feature-chip" data-tip="Ratio of hidden to visible input fields — hidden fields often used for tracking or obfuscation">hidden_to_visible_inputs</div>
338
+ <div class="feature-chip" data-tip="Number of password fields divided by total input fields — phishing forms maximize password collection">password_to_inputs_ratio</div>
339
+ <div class="feature-chip" data-tip="Proportion of empty/placeholder links to all links — broken or disguised navigation is suspicious">empty_to_total_links</div>
340
+ <div class="feature-chip" data-tip="Number of images divided by total HTML tags — low image ratio suggests text-heavy phishing pages">images_to_tags_ratio</div>
341
+ <div class="feature-chip" data-tip="Number of iframes divided by total HTML tags — iframes can hide malicious content or redirects">iframes_to_tags_ratio</div>
342
+ </div>
343
+
344
+ <div class="feature-category-label">Co-occurrence Interactions</div>
345
+ <div class="feature-grid">
346
+ <div class="feature-chip" data-tip="1 if forms exist AND password fields are present — core phishing indicator">forms_with_passwords</div>
347
+ <div class="feature-chip" data-tip="Count of external JavaScript files — malicious JS often hosted externally">external_scripts_links</div>
348
+ <div class="feature-chip" data-tip="1 if urgency keywords AND forms both present — common social engineering pattern">urgency_with_forms</div>
349
+ <div class="feature-chip" data-tip="1 if brand names AND forms both present — brand impersonation with credential harvesting">brand_with_forms</div>
350
+ <div class="feature-chip" data-tip="1 if iframes AND script tags both present — hidden iframes + JS often for malware">iframes_with_scripts</div>
351
+ <div class="feature-chip" data-tip="1 if hidden inputs AND external resources both present — obfuscated tracking/redirects">hidden_with_external</div>
352
+ </div>
353
+
354
+ <div class="feature-category-label">Content Density</div>
355
+ <div class="feature-grid">
356
+ <div class="feature-chip" data-tip="Ratio of visible text length to DOM depth — legitimate sites have better content distribution">content_density</div>
357
+ <div class="feature-chip" data-tip="Number of form elements divided by total words — form-heavy pages are suspicious">form_density</div>
358
+ <div class="feature-chip" data-tip="Number of script tags divided by number of forms — scripts per form indicates obfuscation level">scripts_per_form</div>
359
+ <div class="feature-chip" data-tip="Number of links divided by total words — link density compared to content volume">links_per_word</div>
360
+ </div>
361
+
362
+ <div class="feature-category-label">Composite Risk Scores</div>
363
+ <div class="feature-grid">
364
+ <div class="feature-chip" data-tip="Composite score (0–1) combining form presence, password fields, external links, and scripts">phishing_risk_score</div>
365
+ <div class="feature-chip" data-tip="Score (0–1) specifically for form-based threats: password fields, hidden inputs, external actions">form_risk_score</div>
366
+ <div class="feature-chip" data-tip="Score (0–1) measuring obfuscation techniques: hidden fields, eval usage, encoding, meta refreshes">obfuscation_score</div>
367
+ <div class="feature-chip" data-tip="Score (0–1) measuring legitimacy signals: metadata presence, proper structure, internal links">legitimacy_score</div>
368
+ </div>
369
+
370
+ <div class="feature-category-label">Boolean Flags</div>
371
+ <div class="feature-grid">
372
+ <div class="feature-chip" data-tip="1 if any combination of suspicious elements detected (hidden inputs + eval, eval + meta refresh, etc.)">has_suspicious_elements</div>
373
+ </div>
374
+ </div>
375
+
376
+ </section>
377
+
378
+ <!-- TABS: MODEL CATEGORIES -->
379
+ <section class="section" style="border-bottom:none;padding-bottom:0">
380
+ <div class="tabs">
381
+ <button class="tab active" onclick="switchTab(event,'urlModels')">URL Models <span class="tab-count">3</span></button>
382
+ <button class="tab" onclick="switchTab(event,'htmlModels')">HTML Models <span class="tab-count">2</span></button>
383
+ <button class="tab" onclick="switchTab(event,'combinedModels')">Combined <span class="tab-count">2</span></button>
384
+ <button class="tab" onclick="switchTab(event,'cnnModels')">CNN <span class="tab-count">2</span></button>
385
+ <button class="tab" onclick="switchTab(event,'overview')">Comparison</button>
386
+ </div>
387
+
388
+ <!-- URL MODELS TAB -->
389
+ <div id="urlModels" class="tab-content active">
390
+ <div class="section-subtitle">
391
+ 3 models trained on 120 URL-based features extracted from the URL string structure,
392
+ domain properties, encoding analysis, and brand impersonation detection.
393
+ </div>
394
+
395
+ <!-- Logistic Regression -->
396
+ <div class="model-detail">
397
+ <div class="model-detail-header">
398
+ <div class="model-detail-name">Logistic Regression</div>
399
+ <div class="model-detail-type">Baseline</div>
400
+ </div>
401
+ <div class="metrics-grid">
402
+ <div class="metric-card">
403
+ <div class="metric-value">93.71%</div>
404
+ <div class="metric-label">Accuracy</div>
405
+ </div>
406
+ <div class="metric-card">
407
+ <div class="metric-value">95.40%</div>
408
+ <div class="metric-label">Precision</div>
409
+ </div>
410
+ <div class="metric-card">
411
+ <div class="metric-value">91.84%</div>
412
+ <div class="metric-label">Recall</div>
413
+ </div>
414
+ <div class="metric-card">
415
+ <div class="metric-value">93.59%</div>
416
+ <div class="metric-label">F1-Score</div>
417
+ </div>
418
+ <div class="metric-card">
419
+ <div class="metric-value highlight">0.9789</div>
420
+ <div class="metric-label">ROC-AUC</div>
421
+ </div>
422
+ </div>
423
+ <div class="section-title">Confusion Matrix</div>
424
+ <div class="confusion-matrix">
425
+ <div class="cm-header"></div>
426
+ <div class="cm-header">Pred Legit</div>
427
+ <div class="cm-header">Pred Phish</div>
428
+ <div class="cm-label">Actual Legit</div>
429
+ <div class="cm-cell cm-tn">10,326</div>
430
+ <div class="cm-cell cm-fp">478</div>
431
+ <div class="cm-label">Actual Phish</div>
432
+ <div class="cm-cell cm-fn">881</div>
433
+ <div class="cm-cell cm-tp">9,922</div>
434
+ </div>
435
+ </div>
436
+
437
+ <!-- Random Forest -->
438
+ <div class="model-detail">
439
+ <div class="model-detail-header">
440
+ <div class="model-detail-name">Random Forest</div>
441
+ <div class="model-detail-type">Ensemble</div>
442
+ </div>
443
+ <div class="metrics-grid">
444
+ <div class="metric-card">
445
+ <div class="metric-value">97.63%</div>
446
+ <div class="metric-label">Accuracy</div>
447
+ </div>
448
+ <div class="metric-card">
449
+ <div class="metric-value">99.01%</div>
450
+ <div class="metric-label">Precision</div>
451
+ </div>
452
+ <div class="metric-card">
453
+ <div class="metric-value">96.22%</div>
454
+ <div class="metric-label">Recall</div>
455
+ </div>
456
+ <div class="metric-card">
457
+ <div class="metric-value">97.60%</div>
458
+ <div class="metric-label">F1-Score</div>
459
+ </div>
460
+ <div class="metric-card">
461
+ <div class="metric-value highlight">0.9958</div>
462
+ <div class="metric-label">ROC-AUC</div>
463
+ </div>
464
+ </div>
465
+ <div class="section-title">Confusion Matrix</div>
466
+ <div class="confusion-matrix">
467
+ <div class="cm-header"></div>
468
+ <div class="cm-header">Pred Legit</div>
469
+ <div class="cm-header">Pred Phish</div>
470
+ <div class="cm-label">Actual Legit</div>
471
+ <div class="cm-cell cm-tn">10,700</div>
472
+ <div class="cm-cell cm-fp">104</div>
473
+ <div class="cm-label">Actual Phish</div>
474
+ <div class="cm-cell cm-fn">408</div>
475
+ <div class="cm-cell cm-tp">10,395</div>
476
+ </div>
477
+ <div class="subsection">
478
+ <div class="section-title">Top 20 Features by Importance</div>
479
+ <div class="features-list">
480
+ <div class="feature-row"><span class="feature-rank">1</span><span class="feature-name">domain_length</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:100%"></div></div><span class="feature-importance">0.0500</span></div>
481
+ <div class="feature-row"><span class="feature-rank">2</span><span class="feature-name">num_domain_parts</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:94%"></div></div><span class="feature-importance">0.0471</span></div>
482
+ <div class="feature-row"><span class="feature-rank">3</span><span class="feature-name">domain_dots</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:91%"></div></div><span class="feature-importance">0.0453</span></div>
483
+ <div class="feature-row"><span class="feature-rank">4</span><span class="feature-name">num_subdomains</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:79%"></div></div><span class="feature-importance">0.0393</span></div>
484
+ <div class="feature-row"><span class="feature-rank">5</span><span class="feature-name">num_dots</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:67%"></div></div><span class="feature-importance">0.0337</span></div>
485
+ <div class="feature-row"><span class="feature-rank">6</span><span class="feature-name">domain_length_category</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:67%"></div></div><span class="feature-importance">0.0335</span></div>
486
+ <div class="feature-row"><span class="feature-rank">7</span><span class="feature-name">symbol_ratio_domain</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:65%"></div></div><span class="feature-importance">0.0324</span></div>
487
+ <div class="feature-row"><span class="feature-rank">8</span><span class="feature-name">digit_ratio_url</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:60%"></div></div><span class="feature-importance">0.0298</span></div>
488
+ <div class="feature-row"><span class="feature-rank">9</span><span class="feature-name">path_length</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:59%"></div></div><span class="feature-importance">0.0297</span></div>
489
+ <div class="feature-row"><span class="feature-rank">10</span><span class="feature-name">avg_domain_part_len</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:58%"></div></div><span class="feature-importance">0.0292</span></div>
490
+ <div class="feature-row"><span class="feature-rank">11</span><span class="feature-name">num_digits_url</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:57%"></div></div><span class="feature-importance">0.0283</span></div>
491
+ <div class="feature-row"><span class="feature-rank">12</span><span class="feature-name">domain_entropy</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:56%"></div></div><span class="feature-importance">0.0282</span></div>
492
+ <div class="feature-row"><span class="feature-rank">13</span><span class="feature-name">url_entropy</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:53%"></div></div><span class="feature-importance">0.0266</span></div>
493
+ <div class="feature-row"><span class="feature-rank">14</span><span class="feature-name">max_consecutive_digits</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:49%"></div></div><span class="feature-importance">0.0246</span></div>
494
+ <div class="feature-row"><span class="feature-rank">15</span><span class="feature-name">special_char_ratio</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:48%"></div></div><span class="feature-importance">0.0242</span></div>
495
+ <div class="feature-row"><span class="feature-rank">16</span><span class="feature-name">is_shortened</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:47%"></div></div><span class="feature-importance">0.0237</span></div>
496
+ <div class="feature-row"><span class="feature-rank">17</span><span class="feature-name">path_entropy</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:45%"></div></div><span class="feature-importance">0.0225</span></div>
497
+ <div class="feature-row"><span class="feature-rank">18</span><span class="feature-name">max_path_segment_len</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:43%"></div></div><span class="feature-importance">0.0215</span></div>
498
+ <div class="feature-row"><span class="feature-rank">19</span><span class="feature-name">num_letters_url</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:41%"></div></div><span class="feature-importance">0.0206</span></div>
499
+ <div class="feature-row"><span class="feature-rank">20</span><span class="feature-name">path_slashes</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:40%"></div></div><span class="feature-importance">0.0201</span></div>
500
+ </div>
501
+ </div>
502
+ </div>
503
+
504
+ <!-- XGBoost -->
505
+ <div class="model-detail">
506
+ <div class="model-detail-header">
507
+ <div class="model-detail-name">XGBoost</div>
508
+ <div class="model-detail-type">Gradient Boosting</div>
509
+ </div>
510
+ <div class="metrics-grid">
511
+ <div class="metric-card">
512
+ <div class="metric-value">97.85%</div>
513
+ <div class="metric-label">Accuracy</div>
514
+ </div>
515
+ <div class="metric-card">
516
+ <div class="metric-value">99.00%</div>
517
+ <div class="metric-label">Precision</div>
518
+ </div>
519
+ <div class="metric-card">
520
+ <div class="metric-value">96.68%</div>
521
+ <div class="metric-label">Recall</div>
522
+ </div>
523
+ <div class="metric-card">
524
+ <div class="metric-value">97.82%</div>
525
+ <div class="metric-label">F1-Score</div>
526
+ </div>
527
+ <div class="metric-card">
528
+ <div class="metric-value highlight">0.9953</div>
529
+ <div class="metric-label">ROC-AUC</div>
530
+ </div>
531
+ </div>
532
+ <div class="section-title">Confusion Matrix</div>
533
+ <div class="confusion-matrix">
534
+ <div class="cm-header"></div>
535
+ <div class="cm-header">Pred Legit</div>
536
+ <div class="cm-header">Pred Phish</div>
537
+ <div class="cm-label">Actual Legit</div>
538
+ <div class="cm-cell cm-tn">10,698</div>
539
+ <div class="cm-cell cm-fp">106</div>
540
+ <div class="cm-label">Actual Phish</div>
541
+ <div class="cm-cell cm-fn">359</div>
542
+ <div class="cm-cell cm-tp">10,444</div>
543
+ </div>
544
+ <div class="subsection">
545
+ <div class="section-title">Top 20 Features by Importance</div>
546
+ <div class="features-list">
547
+ <div class="feature-row"><span class="feature-rank">1</span><span class="feature-name">domain_dots</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:100%"></div></div><span class="feature-importance">0.2514</span></div>
548
+ <div class="feature-row"><span class="feature-rank">2</span><span class="feature-name">is_shortened</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:60%"></div></div><span class="feature-importance">0.1519</span></div>
549
+ <div class="feature-row"><span class="feature-rank">3</span><span class="feature-name">num_subdomains</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:57%"></div></div><span class="feature-importance">0.1423</span></div>
550
+ <div class="feature-row"><span class="feature-rank">4</span><span class="feature-name">num_domain_parts</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:20%"></div></div><span class="feature-importance">0.0492</span></div>
551
+ <div class="feature-row"><span class="feature-rank">5</span><span class="feature-name">multiple_brands_in_url</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:14%"></div></div><span class="feature-importance">0.0350</span></div>
552
+ <div class="feature-row"><span class="feature-rank">6</span><span class="feature-name">is_free_platform</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:11%"></div></div><span class="feature-importance">0.0281</span></div>
553
+ <div class="feature-row"><span class="feature-rank">7</span><span class="feature-name">domain_hyphens</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:10%"></div></div><span class="feature-importance">0.0252</span></div>
554
+ <div class="feature-row"><span class="feature-rank">8</span><span class="feature-name">path_digits</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:6%"></div></div><span class="feature-importance">0.0149</span></div>
555
+ <div class="feature-row"><span class="feature-rank">9</span><span class="feature-name">is_http</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:6%"></div></div><span class="feature-importance">0.0139</span></div>
556
+ <div class="feature-row"><span class="feature-rank">10</span><span class="feature-name">platform_subdomain_length</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:5%"></div></div><span class="feature-importance">0.0123</span></div>
557
+ <div class="feature-row"><span class="feature-rank">11</span><span class="feature-name">avg_domain_part_len</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:5%"></div></div><span class="feature-importance">0.0121</span></div>
558
+ <div class="feature-row"><span class="feature-rank">12</span><span class="feature-name">path_slashes</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:5%"></div></div><span class="feature-importance">0.0119</span></div>
559
+ <div class="feature-row"><span class="feature-rank">13</span><span class="feature-name">brand_in_path</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:4%"></div></div><span class="feature-importance">0.0111</span></div>
560
+ <div class="feature-row"><span class="feature-rank">14</span><span class="feature-name">domain_length_category</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:4%"></div></div><span class="feature-importance">0.0108</span></div>
561
+ <div class="feature-row"><span class="feature-rank">15</span><span class="feature-name">domain_length</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:4%"></div></div><span class="feature-importance">0.0106</span></div>
562
+ <div class="feature-row"><span class="feature-rank">16</span><span class="feature-name">symbol_ratio_domain</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:4%"></div></div><span class="feature-importance">0.0100</span></div>
563
+ <div class="feature-row"><span class="feature-rank">17</span><span class="feature-name">encoding_diff</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:3%"></div></div><span class="feature-importance">0.0087</span></div>
564
+ <div class="feature-row"><span class="feature-rank">18</span><span class="feature-name">num_brands</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:3%"></div></div><span class="feature-importance">0.0077</span></div>
565
+ <div class="feature-row"><span class="feature-rank">19</span><span class="feature-name">tld_length</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:3%"></div></div><span class="feature-importance">0.0071</span></div>
566
+ <div class="feature-row"><span class="feature-rank">20</span><span class="feature-name">digit_ratio_url</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:3%"></div></div><span class="feature-importance">0.0067</span></div>
567
+ </div>
568
+ </div>
569
+ </div>
570
+ </div>
571
+
572
+ <!-- HTML MODELS TAB -->
573
+ <div id="htmlModels" class="tab-content">
574
+ <div class="section-subtitle">
575
+ 2 models trained on 100 HTML-based features extracted from the page structure,
576
+ forms, scripts, links, and content analysis of downloaded web pages.
577
+ </div>
578
+
579
+ <!-- Random Forest HTML -->
580
+ <div class="model-detail">
581
+ <div class="model-detail-header">
582
+ <div class="model-detail-name">Random Forest HTML</div>
583
+ <div class="model-detail-type">Ensemble</div>
584
+ </div>
585
+ <div class="metrics-grid">
586
+ <div class="metric-card">
587
+ <div class="metric-value">89.65%</div>
588
+ <div class="metric-label">Accuracy</div>
589
+ </div>
590
+ <div class="metric-card">
591
+ <div class="metric-value">91.78%</div>
592
+ <div class="metric-label">Precision</div>
593
+ </div>
594
+ <div class="metric-card">
595
+ <div class="metric-value">87.11%</div>
596
+ <div class="metric-label">Recall</div>
597
+ </div>
598
+ <div class="metric-card">
599
+ <div class="metric-value">89.38%</div>
600
+ <div class="metric-label">F1-Score</div>
601
+ </div>
602
+ <div class="metric-card">
603
+ <div class="metric-value highlight">0.9617</div>
604
+ <div class="metric-label">ROC-AUC</div>
605
+ </div>
606
+ <div class="metric-card">
607
+ <div class="metric-value">89.05%</div>
608
+ <div class="metric-label">CV F1 (5-fold)</div>
609
+ </div>
610
+ </div>
611
+ <div class="section-title">Confusion Matrix</div>
612
+ <div class="confusion-matrix">
613
+ <div class="cm-header"></div>
614
+ <div class="cm-header">Pred Legit</div>
615
+ <div class="cm-header">Pred Phish</div>
616
+ <div class="cm-label">Actual Legit</div>
617
+ <div class="cm-cell cm-tn">15,012</div>
618
+ <div class="cm-cell cm-fp">1,271</div>
619
+ <div class="cm-label">Actual Phish</div>
620
+ <div class="cm-cell cm-fn">2,099</div>
621
+ <div class="cm-cell cm-tp">14,184</div>
622
+ </div>
623
+ <div class="subsection">
624
+ <div class="section-title">Hyperparameters</div>
625
+ <div class="params-grid">
626
+ <div class="param-item"><span class="param-key">n_estimators</span><span class="param-value">500</span></div>
627
+ <div class="param-item"><span class="param-key">max_depth</span><span class="param-value">35</span></div>
628
+ <div class="param-item"><span class="param-key">min_samples_split</span><span class="param-value">2</span></div>
629
+ <div class="param-item"><span class="param-key">min_samples_leaf</span><span class="param-value">1</span></div>
630
+ <div class="param-item"><span class="param-key">max_features</span><span class="param-value">sqrt</span></div>
631
+ <div class="param-item"><span class="param-key">class_weight</span><span class="param-value">balanced</span></div>
632
+ </div>
633
+ </div>
634
+ </div>
635
+
636
+ <!-- XGBoost HTML -->
637
+ <div class="model-detail">
638
+ <div class="model-detail-header">
639
+ <div class="model-detail-name">XGBoost HTML</div>
640
+ <div class="model-detail-type">Gradient Boosting</div>
641
+ </div>
642
+ <div class="metrics-grid">
643
+ <div class="metric-card">
644
+ <div class="metric-value">89.07%</div>
645
+ <div class="metric-label">Accuracy</div>
646
+ </div>
647
+ <div class="metric-card">
648
+ <div class="metric-value">90.27%</div>
649
+ <div class="metric-label">Precision</div>
650
+ </div>
651
+ <div class="metric-card">
652
+ <div class="metric-value">87.56%</div>
653
+ <div class="metric-label">Recall</div>
654
+ </div>
655
+ <div class="metric-card">
656
+ <div class="metric-value">88.90%</div>
657
+ <div class="metric-label">F1-Score</div>
658
+ </div>
659
+ <div class="metric-card">
660
+ <div class="metric-value highlight">0.9590</div>
661
+ <div class="metric-label">ROC-AUC</div>
662
+ </div>
663
+ <div class="metric-card">
664
+ <div class="metric-value">88.87%</div>
665
+ <div class="metric-label">CV F1 (5-fold)</div>
666
+ </div>
667
+ </div>
668
+ <div class="section-title">Confusion Matrix</div>
669
+ <div class="confusion-matrix">
670
+ <div class="cm-header"></div>
671
+ <div class="cm-header">Pred Legit</div>
672
+ <div class="cm-header">Pred Phish</div>
673
+ <div class="cm-label">Actual Legit</div>
674
+ <div class="cm-cell cm-tn">14,747</div>
675
+ <div class="cm-cell cm-fp">1,536</div>
676
+ <div class="cm-label">Actual Phish</div>
677
+ <div class="cm-cell cm-fn">2,025</div>
678
+ <div class="cm-cell cm-tp">14,258</div>
679
+ </div>
680
+ <div class="subsection">
681
+ <div class="section-title">Hyperparameters</div>
682
+ <div class="params-grid">
683
+ <div class="param-item"><span class="param-key">n_estimators</span><span class="param-value">600</span></div>
684
+ <div class="param-item"><span class="param-key">max_depth</span><span class="param-value">8</span></div>
685
+ <div class="param-item"><span class="param-key">learning_rate</span><span class="param-value">0.05</span></div>
686
+ <div class="param-item"><span class="param-key">subsample</span><span class="param-value">0.8</span></div>
687
+ <div class="param-item"><span class="param-key">colsample_bytree</span><span class="param-value">0.8</span></div>
688
+ <div class="param-item"><span class="param-key">min_child_weight</span><span class="param-value">3</span></div>
689
+ <div class="param-item"><span class="param-key">gamma</span><span class="param-value">0.1</span></div>
690
+ <div class="param-item"><span class="param-key">reg_alpha</span><span class="param-value">0.1</span></div>
691
+ <div class="param-item"><span class="param-key">reg_lambda</span><span class="param-value">1.0</span></div>
692
+ <div class="param-item"><span class="param-key">early_stopping</span><span class="param-value">50 rounds</span></div>
693
+ </div>
694
+ </div>
695
+ </div>
696
+ </div>
697
+
698
+ <!-- COMBINED MODELS TAB -->
699
+ <div id="combinedModels" class="tab-content">
700
+ <div class="section-subtitle">
701
+ 2 models trained on 221 combined features (121 URL + 100 HTML) for maximum detection accuracy.
702
+ </div>
703
+
704
+ <!-- Random Forest Combined -->
705
+ <div class="model-detail">
706
+ <div class="model-detail-header">
707
+ <div class="model-detail-name">Random Forest Combined</div>
708
+ <div class="model-detail-type">Ensemble</div>
709
+ </div>
710
+ <div class="metrics-grid">
711
+ <div class="metric-card">
712
+ <div class="metric-value">98.60%</div>
713
+ <div class="metric-label">Accuracy</div>
714
+ </div>
715
+ <div class="metric-card">
716
+ <div class="metric-value">99.16%</div>
717
+ <div class="metric-label">Precision</div>
718
+ </div>
719
+ <div class="metric-card">
720
+ <div class="metric-value">98.02%</div>
721
+ <div class="metric-label">Recall</div>
722
+ </div>
723
+ <div class="metric-card">
724
+ <div class="metric-value">98.59%</div>
725
+ <div class="metric-label">F1-Score</div>
726
+ </div>
727
+ <div class="metric-card">
728
+ <div class="metric-value highlight">0.9990</div>
729
+ <div class="metric-label">ROC-AUC</div>
730
+ </div>
731
+ <div class="metric-card">
732
+ <div class="metric-value">98.59%</div>
733
+ <div class="metric-label">CV F1 (5-fold)</div>
734
+ </div>
735
+ </div>
736
+ <div class="section-title">Confusion Matrix</div>
737
+ <div class="confusion-matrix">
738
+ <div class="cm-header"></div>
739
+ <div class="cm-header">Pred Legit</div>
740
+ <div class="cm-header">Pred Phish</div>
741
+ <div class="cm-label">Actual Legit</div>
742
+ <div class="cm-cell cm-tn">10,680</div>
743
+ <div class="cm-cell cm-fp">89</div>
744
+ <div class="cm-label">Actual Phish</div>
745
+ <div class="cm-cell cm-fn">213</div>
746
+ <div class="cm-cell cm-tp">10,556</div>
747
+ </div>
748
+ <div class="subsection">
749
+ <div class="section-title">Feature Importance Split</div>
750
+ <div class="feature-split">
751
+ <div class="feature-split-bar">
752
+ <div class="split-url" style="width:29.1%">URL 29.1%</div>
753
+ <div class="split-html" style="width:70.9%">HTML 70.9%</div>
754
+ </div>
755
+ </div>
756
+ </div>
757
+ <div class="subsection">
758
+ <div class="section-title">Top 15 Features by Importance</div>
759
+ <div class="features-list">
760
+ <div class="feature-row"><span class="feature-rank">1</span><span class="feature-name">html_num_links</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:100%"></div></div><span class="feature-importance">0.0640</span></div>
761
+ <div class="feature-row"><span class="feature-rank">2</span><span class="feature-name">html_text_length</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:90%"></div></div><span class="feature-importance">0.0577</span></div>
762
+ <div class="feature-row"><span class="feature-rank">3</span><span class="feature-name">html_num_tags</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:75%"></div></div><span class="feature-importance">0.0479</span></div>
763
+ <div class="feature-row"><span class="feature-rank">4</span><span class="feature-name">html_num_internal_links</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:72%"></div></div><span class="feature-importance">0.0463</span></div>
764
+ <div class="feature-row"><span class="feature-rank">5</span><span class="feature-name">html_num_words</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:66%"></div></div><span class="feature-importance">0.0422</span></div>
765
+ <div class="feature-row"><span class="feature-rank">6</span><span class="feature-name">html_external_scripts_links</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:56%"></div></div><span class="feature-importance">0.0361</span></div>
766
+ <div class="feature-row"><span class="feature-rank">7</span><span class="feature-name">html_num_divs</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:46%"></div></div><span class="feature-importance">0.0297</span></div>
767
+ <div class="feature-row"><span class="feature-rank">8</span><span class="feature-name">html_num_lists</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:45%"></div></div><span class="feature-importance">0.0291</span></div>
768
+ <div class="feature-row"><span class="feature-rank">9</span><span class="feature-name">html_num_external_links</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:43%"></div></div><span class="feature-importance">0.0276</span></div>
769
+ <div class="feature-row"><span class="feature-rank">10</span><span class="feature-name">html_has_description</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:40%"></div></div><span class="feature-importance">0.0258</span></div>
770
+ <div class="feature-row"><span class="feature-rank">11</span><span class="feature-name">html_num_unique_external_domains</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:37%"></div></div><span class="feature-importance">0.0236</span></div>
771
+ <div class="feature-row"><span class="feature-rank">12</span><span class="feature-name">html_num_images</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:36%"></div></div><span class="feature-importance">0.0231</span></div>
772
+ <div class="feature-row"><span class="feature-rank">13</span><span class="feature-name">html_num_spans</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:35%"></div></div><span class="feature-importance">0.0226</span></div>
773
+ <div class="feature-row"><span class="feature-rank">14</span><span class="feature-name">html_num_headings</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:34%"></div></div><span class="feature-importance">0.0220</span></div>
774
+ <div class="feature-row"><span class="feature-rank">15</span><span class="feature-name">html_dom_depth</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:33%"></div></div><span class="feature-importance">0.0210</span></div>
775
+ </div>
776
+ </div>
777
+ <div class="subsection">
778
+ <div class="section-title">Hyperparameters</div>
779
+ <div class="params-grid">
780
+ <div class="param-item"><span class="param-key">n_estimators</span><span class="param-value">533</span></div>
781
+ <div class="param-item"><span class="param-key">max_depth</span><span class="param-value">43</span></div>
782
+ <div class="param-item"><span class="param-key">min_samples_split</span><span class="param-value">2</span></div>
783
+ <div class="param-item"><span class="param-key">max_features</span><span class="param-value">sqrt</span></div>
784
+ <div class="param-item"><span class="param-key">class_weight</span><span class="param-value">balanced</span></div>
785
+ </div>
786
+ </div>
787
+ </div>
788
+
789
+ <!-- XGBoost Combined -->
790
+ <div class="model-detail">
791
+ <div class="model-detail-header">
792
+ <div class="model-detail-name">XGBoost Combined</div>
793
+ <div class="model-detail-type">Gradient Boosting</div>
794
+ </div>
795
+ <div class="metrics-grid">
796
+ <div class="metric-card">
797
+ <div class="metric-value">99.01%</div>
798
+ <div class="metric-label">Accuracy</div>
799
+ </div>
800
+ <div class="metric-card">
801
+ <div class="metric-value">99.35%</div>
802
+ <div class="metric-label">Precision</div>
803
+ </div>
804
+ <div class="metric-card">
805
+ <div class="metric-value">98.66%</div>
806
+ <div class="metric-label">Recall</div>
807
+ </div>
808
+ <div class="metric-card">
809
+ <div class="metric-value">99.01%</div>
810
+ <div class="metric-label">F1-Score</div>
811
+ </div>
812
+ <div class="metric-card">
813
+ <div class="metric-value highlight">0.9991</div>
814
+ <div class="metric-label">ROC-AUC</div>
815
+ </div>
816
+ <div class="metric-card">
817
+ <div class="metric-value">98.90%</div>
818
+ <div class="metric-label">CV F1 (5-fold)</div>
819
+ </div>
820
+ </div>
821
+ <div class="section-title">Confusion Matrix</div>
822
+ <div class="confusion-matrix">
823
+ <div class="cm-header"></div>
824
+ <div class="cm-header">Pred Legit</div>
825
+ <div class="cm-header">Pred Phish</div>
826
+ <div class="cm-label">Actual Legit</div>
827
+ <div class="cm-cell cm-tn">10,700</div>
828
+ <div class="cm-cell cm-fp">69</div>
829
+ <div class="cm-label">Actual Phish</div>
830
+ <div class="cm-cell cm-fn">144</div>
831
+ <div class="cm-cell cm-tp">10,625</div>
832
+ </div>
833
+ <div class="subsection">
834
+ <div class="section-title">Feature Importance Split</div>
835
+ <div class="feature-split">
836
+ <div class="feature-split-bar">
837
+ <div class="split-url" style="width:37.1%">URL 37.1%</div>
838
+ <div class="split-html" style="width:62.9%">HTML 62.9%</div>
839
+ </div>
840
+ </div>
841
+ </div>
842
+ <div class="subsection">
843
+ <div class="section-title">Top 15 Features by Importance</div>
844
+ <div class="features-list">
845
+ <div class="feature-row"><span class="feature-rank">1</span><span class="feature-name">html_num_links</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:100%"></div></div><span class="feature-importance">0.4420</span></div>
846
+ <div class="feature-row"><span class="feature-rank">2</span><span class="feature-name">url_is_shortened</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:10%"></div></div><span class="feature-importance">0.0427</span></div>
847
+ <div class="feature-row"><span class="feature-rank">3</span><span class="feature-name">url_platform_subdomain_length</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:9%"></div></div><span class="feature-importance">0.0397</span></div>
848
+ <div class="feature-row"><span class="feature-rank">4</span><span class="feature-name">url_domain_dots</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:7%"></div></div><span class="feature-importance">0.0315</span></div>
849
+ <div class="feature-row"><span class="feature-rank">5</span><span class="feature-name">html_has_fromcharcode</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:7%"></div></div><span class="feature-importance">0.0296</span></div>
850
+ <div class="feature-row"><span class="feature-rank">6</span><span class="feature-name">url_num_domain_parts</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:6%"></div></div><span class="feature-importance">0.0269</span></div>
851
+ <div class="feature-row"><span class="feature-rank">7</span><span class="feature-name">html_has_meta_refresh</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:3%"></div></div><span class="feature-importance">0.0148</span></div>
852
+ <div class="feature-row"><span class="feature-rank">8</span><span class="feature-name">url_is_http</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:3%"></div></div><span class="feature-importance">0.0126</span></div>
853
+ <div class="feature-row"><span class="feature-rank">9</span><span class="feature-name">url_encoding_diff</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:3%"></div></div><span class="feature-importance">0.0124</span></div>
854
+ <div class="feature-row"><span class="feature-rank">10</span><span class="feature-name">url_path_digits</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:3%"></div></div><span class="feature-importance">0.0116</span></div>
855
+ <div class="feature-row"><span class="feature-rank">11</span><span class="feature-name">html_text_length</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:2%"></div></div><span class="feature-importance">0.0107</span></div>
856
+ <div class="feature-row"><span class="feature-rank">12</span><span class="feature-name">url_path_slashes</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:2%"></div></div><span class="feature-importance">0.0105</span></div>
857
+ <div class="feature-row"><span class="feature-rank">13</span><span class="feature-name">url_multiple_brands_in_url</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:2%"></div></div><span class="feature-importance">0.0103</span></div>
858
+ <div class="feature-row"><span class="feature-rank">14</span><span class="feature-name">url_brand_in_path</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:2%"></div></div><span class="feature-importance">0.0102</span></div>
859
+ <div class="feature-row"><span class="feature-rank">15</span><span class="feature-name">url_domain_hyphens</span><div class="importance-bar-bg"><div class="importance-bar-fill" style="width:2%"></div></div><span class="feature-importance">0.0095</span></div>
860
+ </div>
861
+ </div>
862
+ <div class="subsection">
863
+ <div class="section-title">Hyperparameters</div>
864
+ <div class="params-grid">
865
+ <div class="param-item"><span class="param-key">n_estimators</span><span class="param-value">726</span></div>
866
+ <div class="param-item"><span class="param-key">max_depth</span><span class="param-value">6</span></div>
867
+ <div class="param-item"><span class="param-key">learning_rate</span><span class="param-value">0.137</span></div>
868
+ <div class="param-item"><span class="param-key">subsample</span><span class="param-value">0.698</span></div>
869
+ </div>
870
+ </div>
871
+ </div>
872
+ </div>
873
+
874
+ <!-- CNN MODELS TAB -->
875
+ <div id="cnnModels" class="tab-content">
876
+ <div class="section-subtitle">
877
+ 2 character-level CNN models that process raw text directly &mdash; no hand-crafted features needed.
878
+ Parallel Conv1D branches with kernel sizes (3, 5, 7) capture patterns at different scales.
879
+ </div>
880
+
881
+ <!-- CNN URL -->
882
+ <div class="model-detail">
883
+ <div class="model-detail-header">
884
+ <div class="model-detail-name">CNN URL (Char-level)</div>
885
+ <div class="model-detail-type">Deep Learning</div>
886
+ </div>
887
+ <div class="metrics-grid">
888
+ <div class="metric-card">
889
+ <div class="metric-value">98.38%</div>
890
+ <div class="metric-label">Accuracy</div>
891
+ </div>
892
+ <div class="metric-card">
893
+ <div class="metric-value">98.88%</div>
894
+ <div class="metric-label">Precision</div>
895
+ </div>
896
+ <div class="metric-card">
897
+ <div class="metric-value">97.86%</div>
898
+ <div class="metric-label">Recall</div>
899
+ </div>
900
+ <div class="metric-card">
901
+ <div class="metric-value">98.37%</div>
902
+ <div class="metric-label">F1-Score</div>
903
+ </div>
904
+ <div class="metric-card">
905
+ <div class="metric-value highlight">0.9976</div>
906
+ <div class="metric-label">ROC-AUC</div>
907
+ </div>
908
+ </div>
909
+ <div class="section-title">Confusion Matrix</div>
910
+ <div class="confusion-matrix">
911
+ <div class="cm-header"></div>
912
+ <div class="cm-header">Pred Legit</div>
913
+ <div class="cm-header">Pred Phish</div>
914
+ <div class="cm-label">Actual Legit</div>
915
+ <div class="cm-cell cm-tn">8,013</div>
916
+ <div class="cm-cell cm-fp">90</div>
917
+ <div class="cm-label">Actual Phish</div>
918
+ <div class="cm-cell cm-fn">173</div>
919
+ <div class="cm-cell cm-tp">7,930</div>
920
+ </div>
921
+ <div class="subsection">
922
+ <div class="section-title">Architecture</div>
923
+ <div class="params-grid">
924
+ <div class="param-item"><span class="param-key">Input</span><span class="param-value">Raw URL characters</span></div>
925
+ <div class="param-item"><span class="param-key">max_len</span><span class="param-value">800</span></div>
926
+ <div class="param-item"><span class="param-key">vocab_size</span><span class="param-value">87</span></div>
927
+ <div class="param-item"><span class="param-key">Conv1D kernels</span><span class="param-value">3, 5, 7</span></div>
928
+ <div class="param-item"><span class="param-key">Dataset</span><span class="param-value">108,034 URLs</span></div>
929
+ </div>
930
+ </div>
931
+ </div>
932
+
933
+ <!-- CNN HTML -->
934
+ <div class="model-detail">
935
+ <div class="model-detail-header">
936
+ <div class="model-detail-name">CNN HTML (Char-level)</div>
937
+ <div class="model-detail-type">Deep Learning</div>
938
+ </div>
939
+ <div class="metrics-grid">
940
+ <div class="metric-card">
941
+ <div class="metric-value">96.33%</div>
942
+ <div class="metric-label">Accuracy</div>
943
+ </div>
944
+ <div class="metric-card">
945
+ <div class="metric-value">98.18%</div>
946
+ <div class="metric-label">Precision</div>
947
+ </div>
948
+ <div class="metric-card">
949
+ <div class="metric-value">94.41%</div>
950
+ <div class="metric-label">Recall</div>
951
+ </div>
952
+ <div class="metric-card">
953
+ <div class="metric-value">96.26%</div>
954
+ <div class="metric-label">F1-Score</div>
955
+ </div>
956
+ <div class="metric-card">
957
+ <div class="metric-value highlight">0.9908</div>
958
+ <div class="metric-label">ROC-AUC</div>
959
+ </div>
960
+ </div>
961
+ <div class="section-title">Confusion Matrix</div>
962
+ <div class="confusion-matrix">
963
+ <div class="cm-header"></div>
964
+ <div class="cm-header">Pred Legit</div>
965
+ <div class="cm-header">Pred Phish</div>
966
+ <div class="cm-label">Actual Legit</div>
967
+ <div class="cm-cell cm-tn">5,943</div>
968
+ <div class="cm-cell cm-fp">106</div>
969
+ <div class="cm-label">Actual Phish</div>
970
+ <div class="cm-cell cm-fn">338</div>
971
+ <div class="cm-cell cm-tp">5,711</div>
972
+ </div>
973
+ <div class="subsection">
974
+ <div class="section-title">Architecture</div>
975
+ <div class="params-grid">
976
+ <div class="param-item"><span class="param-key">Input</span><span class="param-value">Raw HTML source</span></div>
977
+ <div class="param-item"><span class="param-key">max_len</span><span class="param-value">5,000</span></div>
978
+ <div class="param-item"><span class="param-key">vocab_size</span><span class="param-value">100</span></div>
979
+ <div class="param-item"><span class="param-key">Conv1D kernels</span><span class="param-value">3, 5, 7</span></div>
980
+ <div class="param-item"><span class="param-key">Dataset</span><span class="param-value">80,652 HTML pages</span></div>
981
+ </div>
982
+ </div>
983
+ </div>
984
+ </div>
985
+
986
+ <!-- COMPARISON TAB -->
987
+ <div id="overview" class="tab-content">
988
+ <div class="section-subtitle">
989
+ Side-by-side comparison of all 9 models across URL, HTML, Combined, and CNN categories.
990
+ </div>
991
+
992
+ <div class="section-title">All Models</div>
993
+ <div class="table-scroll">
994
+ <table class="comparison-table">
995
+ <thead>
996
+ <tr>
997
+ <th>Model</th>
998
+ <th>Category</th>
999
+ <th>Accuracy</th>
1000
+ <th>Precision</th>
1001
+ <th>Recall</th>
1002
+ <th>F1-Score</th>
1003
+ <th>ROC-AUC</th>
1004
+ <th>Features</th>
1005
+ </tr>
1006
+ </thead>
1007
+ <tbody>
1008
+ <tr>
1009
+ <td class="model-name-cell">Logistic Regression</td>
1010
+ <td>URL</td>
1011
+ <td>93.71%</td>
1012
+ <td>95.40%</td>
1013
+ <td>91.84%</td>
1014
+ <td>93.59%</td>
1015
+ <td>0.9789</td>
1016
+ <td>121</td>
1017
+ </tr>
1018
+ <tr>
1019
+ <td class="model-name-cell">Random Forest</td>
1020
+ <td>URL</td>
1021
+ <td>97.71%</td>
1022
+ <td>99.06%</td>
1023
+ <td>96.33%</td>
1024
+ <td>97.68%</td>
1025
+ <td>0.9958</td>
1026
+ <td>121</td>
1027
+ </tr>
1028
+ <tr>
1029
+ <td class="model-name-cell">XGBoost</td>
1030
+ <td>URL</td>
1031
+ <td>98.07%</td>
1032
+ <td>99.12%</td>
1033
+ <td>97.00%</td>
1034
+ <td>98.05%</td>
1035
+ <td>0.9963</td>
1036
+ <td>121</td>
1037
+ </tr>
1038
+ <tr>
1039
+ <td class="model-name-cell">Random Forest HTML</td>
1040
+ <td>HTML</td>
1041
+ <td>88.03%</td>
1042
+ <td>87.49%</td>
1043
+ <td>88.74%</td>
1044
+ <td>88.11%</td>
1045
+ <td>0.9561</td>
1046
+ <td>100</td>
1047
+ </tr>
1048
+ <tr>
1049
+ <td class="model-name-cell">XGBoost HTML</td>
1050
+ <td>HTML</td>
1051
+ <td>87.86%</td>
1052
+ <td>86.45%</td>
1053
+ <td>89.80%</td>
1054
+ <td>88.09%</td>
1055
+ <td>0.9557</td>
1056
+ <td>100</td>
1057
+ </tr>
1058
+ <tr>
1059
+ <td class="model-name-cell">RF Combined</td>
1060
+ <td>Combined</td>
1061
+ <td>98.60%</td>
1062
+ <td>99.16%</td>
1063
+ <td>98.02%</td>
1064
+ <td>98.59%</td>
1065
+ <td>0.9990</td>
1066
+ <td>221</td>
1067
+ </tr>
1068
+ <tr>
1069
+ <td class="model-name-cell">XGBoost Combined</td>
1070
+ <td>Combined</td>
1071
+ <td class="best">99.01%</td>
1072
+ <td class="best">99.35%</td>
1073
+ <td class="best">98.66%</td>
1074
+ <td class="best">99.01%</td>
1075
+ <td class="best">0.9991</td>
1076
+ <td>221</td>
1077
+ </tr>
1078
+ <tr>
1079
+ <td class="model-name-cell">CNN URL</td>
1080
+ <td>CNN</td>
1081
+ <td>98.38%</td>
1082
+ <td>98.88%</td>
1083
+ <td>97.86%</td>
1084
+ <td>98.37%</td>
1085
+ <td>0.9976</td>
1086
+ <td>chars</td>
1087
+ </tr>
1088
+ <tr>
1089
+ <td class="model-name-cell">CNN HTML</td>
1090
+ <td>CNN</td>
1091
+ <td>96.33%</td>
1092
+ <td>98.18%</td>
1093
+ <td>94.41%</td>
1094
+ <td>96.26%</td>
1095
+ <td>0.9908</td>
1096
+ <td>chars</td>
1097
+ </tr>
1098
+ </tbody>
1099
+ </table>
1100
+ </div>
1101
+
1102
+ <div class="section-title">Key Insights</div>
1103
+ <div class="insights-grid">
1104
+ <div class="insight-card insight-safe">
1105
+ <div class="insight-label">Best Overall</div>
1106
+ <div class="insight-title">XGBoost Combined</div>
1107
+ <div class="insight-desc">99.01% accuracy, 99.35% precision &mdash; best performance by combining 121 URL + 100 HTML features.</div>
1108
+ </div>
1109
+ <div class="insight-card insight-safe">
1110
+ <div class="insight-label">Ensemble Strength</div>
1111
+ <div class="insight-title">9-Model Consensus</div>
1112
+ <div class="insight-desc">Combining 3 URL + 2 HTML + 2 Combined + 2 CNN models via majority vote maximizes reliability.</div>
1113
+ </div>
1114
+ <div class="insight-card insight-accent">
1115
+ <div class="insight-label">Top Signal</div>
1116
+ <div class="insight-title">html_num_links</div>
1117
+ <div class="insight-desc">Number of links in HTML dominates XGBoost Combined at 44.2% importance &mdash; the single strongest feature.</div>
1118
+ </div>
1119
+ </div>
1120
+ </div>
1121
+ </section>
1122
+
1123
+ <footer>
1124
+ <div class="footer-text">Machine Learning Phishing Detection</div>
1125
+ </footer>
1126
+ </div>
1127
+
1128
+ <script src="/static/script.js?v=4"></script>
1129
+ </body>
1130
+ </html>
server/static/script.js ADDED
@@ -0,0 +1,509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* ================================================================
   Phishing Detection – UI Controller (Unified All-Models)
   ================================================================ */

// Base URL for all API calls — same origin as the page serving this script.
const API_BASE = window.location.origin;

// ── DOM Refs ────────────────────────────────────────────────────
// Lazy lookup functions (not cached nodes) so references stay valid
// after innerHTML rewrites and regardless of script placement.
const $url = () => document.getElementById('urlInput');
const $loading = () => document.getElementById('loading');
const $results = () => document.getElementById('results');
12
// ── Feature-key catalogues ──────────────────────────────────────
// The 20 URL features rendered by default in the results view
// (display order; presumably chosen by model importance — confirm
// against the feature-importance output).
const TOP_URL_FEATURES = [
    'num_domain_parts', 'domain_dots', 'is_shortened', 'num_subdomains',
    'domain_hyphens', 'is_free_platform', 'platform_subdomain_length',
    'avg_domain_part_len', 'domain_length_category', 'path_digits', 'is_http',
    'multiple_brands_in_url', 'brand_in_path', 'path_slashes', 'encoding_diff',
    'symbol_ratio_domain', 'domain_length', 'has_at_symbol', 'tld_length',
    'is_free_hosting',
];

// Full ordered catalogue of URL features the API may return; the
// "show all" section renders everything here not already in
// TOP_URL_FEATURES (see renderFeatureSection).
const ALL_URL_FEATURES = [
    'url_length', 'domain_length', 'path_length', 'query_length', 'url_length_category',
    'domain_length_category', 'num_dots', 'num_hyphens', 'num_underscores', 'num_slashes',
    'num_question_marks', 'num_ampersands', 'num_equals', 'num_at', 'num_percent',
    'num_digits_url', 'num_letters_url', 'domain_dots', 'domain_hyphens', 'domain_digits',
    'path_slashes', 'path_dots', 'path_digits', 'digit_ratio_url', 'letter_ratio_url',
    'special_char_ratio', 'digit_ratio_domain', 'symbol_ratio_domain', 'num_subdomains',
    'num_domain_parts', 'tld_length', 'sld_length', 'longest_domain_part', 'avg_domain_part_len',
    'longest_part_gt_20', 'longest_part_gt_30', 'longest_part_gt_40', 'has_suspicious_tld',
    'has_trusted_tld', 'has_port', 'has_non_std_port', 'domain_randomness_score',
    'sld_consonant_cluster_score', 'sld_keyboard_pattern', 'sld_has_dictionary_word',
    'sld_pronounceability_score', 'domain_digit_position_suspicious', 'path_depth',
    'max_path_segment_len', 'avg_path_segment_len', 'has_extension', 'extension_category',
    'has_suspicious_extension', 'has_exe', 'has_double_slash', 'path_has_brand_not_domain',
    'path_has_ip_pattern', 'suspicious_path_extension_combo', 'num_params', 'has_query',
    'query_value_length', 'max_param_len', 'query_has_url', 'url_entropy', 'domain_entropy',
    'path_entropy', 'max_consecutive_digits', 'max_consecutive_chars', 'max_consecutive_consonants',
    'char_repeat_rate', 'unique_bigram_ratio', 'unique_trigram_ratio', 'sld_letter_diversity',
    'domain_has_numbers_letters', 'url_complexity_score', 'has_ip_address', 'has_at_symbol',
    'has_redirect', 'is_shortened', 'is_free_hosting', 'is_free_platform',
    'platform_subdomain_length', 'has_uuid_subdomain', 'is_http',
    'num_phishing_keywords', 'phishing_in_domain', 'phishing_in_path', 'num_brands',
    'brand_in_domain', 'brand_in_path', 'brand_impersonation', 'has_login', 'has_account',
    'has_verify', 'has_secure', 'has_update', 'has_bank', 'has_password', 'has_suspend',
    'has_webscr', 'has_cmd', 'has_cgi', 'brand_in_subdomain_not_domain', 'multiple_brands_in_url',
    'brand_with_hyphen', 'suspicious_brand_tld', 'brand_keyword_combo', 'has_url_encoding',
    'encoding_count', 'encoding_diff', 'has_punycode', 'has_unicode', 'has_hex_string',
    'has_base64', 'has_lookalike_chars', 'mixed_script_score', 'homograph_brand_risk',
    'suspected_idn_homograph', 'double_encoding', 'encoding_in_domain', 'suspicious_unicode_category',
];
52
+
53
// The 20 HTML features rendered by default (display order).
const TOP_HTML_FEATURES = [
    'has_login_form', 'num_password_fields', 'password_with_external_action',
    'num_external_form_actions', 'num_empty_form_actions', 'num_hidden_fields',
    'ratio_external_links', 'num_external_links', 'num_ip_based_links',
    'num_suspicious_tld_links', 'has_eval', 'has_base64', 'has_atob',
    'has_fromcharcode', 'has_document_write', 'has_right_click_disabled',
    'has_status_bar_customization', 'has_meta_refresh', 'has_location_replace',
    'num_hidden_iframes',
];

// Full ordered catalogue of HTML features; the "show all" section
// renders everything here not already in TOP_HTML_FEATURES.
const ALL_HTML_FEATURES = [
    'html_length', 'num_tags', 'num_divs', 'num_spans', 'num_paragraphs',
    'num_headings', 'num_lists', 'num_images', 'num_iframes', 'num_tables',
    'has_title', 'dom_depth',
    'num_forms', 'num_input_fields', 'num_password_fields', 'num_email_fields',
    'num_text_fields', 'num_submit_buttons', 'num_hidden_fields', 'has_login_form',
    'has_form', 'num_external_form_actions', 'num_empty_form_actions',
    'num_links', 'num_external_links', 'num_internal_links', 'num_empty_links',
    'num_mailto_links', 'num_javascript_links', 'ratio_external_links',
    'num_ip_based_links', 'num_suspicious_tld_links', 'num_anchor_text_mismatch',
    'num_scripts', 'num_inline_scripts', 'num_external_scripts',
    'has_eval', 'has_unescape', 'has_escape', 'has_document_write',
    'text_length', 'num_words', 'text_to_html_ratio', 'num_brand_mentions',
    'num_urgency_keywords', 'has_copyright', 'has_phone_number', 'has_email_address',
    'num_meta_tags', 'has_description', 'has_keywords', 'has_author',
    'has_viewport', 'has_meta_refresh',
    'num_css_files', 'num_external_css', 'num_external_images',
    'num_data_uri_images', 'num_inline_styles', 'inline_css_length', 'has_favicon',
    'password_with_external_action', 'has_base64', 'has_atob', 'has_fromcharcode',
    'num_onload_events', 'num_onerror_events', 'num_onclick_events',
    'num_unique_external_domains', 'num_forms_without_labels',
    'has_display_none', 'has_visibility_hidden', 'has_window_open',
    'has_location_replace', 'num_hidden_iframes', 'has_right_click_disabled',
    'has_status_bar_customization',
];
88
+
89
// ── Highlight rules ─────────────────────────────────────────────
// These tables drive the colour-coding in renderFeature().

// Boolean features where a truthy value is a GOOD (safe) sign.
const GOOD_INDICATORS = new Set([
    'has_trusted_tld', 'has_title', 'has_favicon', 'sld_has_dictionary_word',
]);

// Boolean features where a truthy value is a BAD (phishing) sign.
const BAD_INDICATORS = new Set([
    'is_shortened', 'is_free_hosting', 'is_free_platform',
    'has_ip_address', 'has_at_symbol', 'has_suspicious_tld',
    'has_meta_refresh', 'has_popup_window', 'form_action_external',
    'has_base64', 'brand_impersonation', 'has_punycode',
    'has_unicode', 'has_hex_string', 'suspected_idn_homograph',
    'is_http', 'multiple_brands_in_url', 'brand_in_path',
]);

// Numeric features flagged dangerous when value `op` threshold holds,
// expressed as [threshold, operator].
const DANGER_THRESHOLDS = {
    num_password_fields: [0, '>'],
    num_hidden_fields: [2, '>'],
    num_urgency_keywords: [0, '>'],
    num_phishing_keywords: [0, '>'],
    num_external_scripts: [10, '>'],
    platform_subdomain_length: [5, '>'],
    domain_dots: [3, '>'],
    num_subdomains: [3, '>'],
    domain_entropy: [4.5, '>'],
    symbol_ratio_domain: [0.3, '>'],
    max_consecutive_digits: [5, '>'],
    domain_hyphens: [1, '>'],
    path_digits: [5, '>'],
    encoding_diff: [0.5, '>'],
};

// Numeric features highlighted as safe, same [threshold, operator] shape.
const SAFE_THRESHOLDS = {
    domain_length: [15, '<'],
    domain_entropy: [3.5, '<'],
    num_brands: [1, '=='],
    num_domain_parts: [2, '=='],
};
127
+
128
// ── API helpers ─────────────────────────────────────────────────

/**
 * Normalize raw user input into a fetchable URL.
 * Trims whitespace, returns null for empty input, and prepends
 * "https://" when no explicit http(s) scheme is present.
 */
function normalizeUrl(raw) {
    const value = raw.trim();
    if (!value) {
        return null;
    }
    const hasScheme = value.startsWith('http://') || value.startsWith('https://');
    return hasScheme ? value : 'https://' + value;
}
137
+
138
/**
 * POST the normalized URL from the input box to an API endpoint.
 * `body` maps the normalized URL to the JSON request payload.
 * Shows the loading indicator while the request is in flight and
 * returns the parsed JSON response; on failure it alerts the user,
 * hides the loader, and returns null (or undefined for empty input).
 */
async function fetchPrediction(endpoint, body) {
    const target = normalizeUrl($url().value);
    if (!target) { alert('Please enter a URL'); return; }

    showLoading();
    try {
        const response = await fetch(`${API_BASE}${endpoint}`, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify(body(target)),
        });
        if (!response.ok) {
            throw new Error('Analysis failed');
        }
        return await response.json();
    } catch (err) {
        alert('Error: ' + err.message);
        hideLoading();
        return null;
    }
}
157
+
158
// ── Public actions ──────────────────────────────────────────────

/** Run every model family against the entered URL and render the results. */
async function analyzeAll() {
    const payload = await fetchPrediction('/api/predict/all', url => ({ url }));
    if (payload) {
        displayAllResults(payload);
    }
}
164
+
165
/** Hide the results panel and empty the URL input box. */
function clearResults() {
    const panel = $results();
    const field = $url();
    if (panel) panel.style.display = 'none';
    if (field) field.value = '';
}
171
+
172
// ── Ensemble weights (F1-score based) ───────────────────────────
// Per-model weights for the weighted-average verdict; models not
// listed here fall back to 0.90 inside computeEnsembleVerdict().
const MODEL_WEIGHTS = {
    'Logistic Regression': 0.9359,
    'Random Forest': 0.9768,
    'XGBoost': 0.9805,
    'Random Forest HTML': 0.8811,
    'XGBoost HTML': 0.8809,
    'Random Forest Combined': 0.9859,
    'XGBoost Combined': 0.9901,
    'CNN URL (Char-level)': 0.9837,
    'CNN HTML (Char-level)': 0.9626,
};

/**
 * Fold every available model prediction (URL / HTML / Combined / CNN
 * sections) into one verdict object:
 *   score         weighted mean phishing probability (percent, 1 d.p.)
 *   isPhishing    true when score >= 50
 *   totalModels   number of predictions considered
 *   phishingVotes how many models said 'PHISHING'
 * With no predictions at all, a zeroed "legitimate" verdict is returned.
 */
function computeEnsembleVerdict(data) {
    const sections = ['url_models', 'html_models', 'combined_models', 'cnn_models'];
    const preds = [];
    for (const name of sections) {
        const section = data[name];
        if (section && section.predictions) {
            preds.push(...section.predictions);
        }
    }

    if (!preds.length) {
        return { score: 0, isPhishing: false, totalModels: 0, phishingVotes: 0 };
    }

    let weightedSum = 0;
    let totalWeight = 0;
    let phishingVotes = 0;
    for (const pred of preds) {
        const weight = MODEL_WEIGHTS[pred.model_name] || 0.90;
        weightedSum += weight * (pred.phishing_probability / 100);
        totalWeight += weight;
        if (pred.prediction === 'PHISHING') phishingVotes += 1;
    }

    const score = totalWeight > 0 ? (weightedSum / totalWeight) * 100 : 0;

    return {
        score: Math.round(score * 10) / 10,
        isPhishing: score >= 50,
        totalModels: preds.length,
        phishingVotes,
    };
}
221
+
222
// ── Loading UI ──────────────────────────────────────────────────

/** Show the loading indicator, hiding any previously rendered results. */
function showLoading() {
    $results().style.display = 'none';
    $loading().style.display = 'block';
}
228
+
229
/** Hide the loading indicator. */
function hideLoading() {
    const spinner = $loading();
    spinner.style.display = 'none';
}
232
+
233
// UNIFIED RESULTS
/**
 * Render the complete results view for /api/predict/all:
 * a weighted-ensemble verdict banner, the analyzed URL, and one tab
 * per available model family (URL / HTML / Combined / CNN).
 *
 * Fix: `data.url` and `data.html_error` are user/server-influenced
 * strings that were interpolated into innerHTML unescaped, allowing
 * HTML/script injection into the page. They are now HTML-escaped.
 */
function displayAllResults(data) {
    hideLoading();
    const el = $results();
    el.style.display = 'block';

    // Minimal escaper for untrusted text placed inside innerHTML.
    const esc = s => String(s)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;');

    // Weighted ensemble verdict
    const verdict = computeEnsembleVerdict(data);
    const statusClass = verdict.isPhishing ? 'danger' : 'safe';
    const statusText = verdict.isPhishing ? 'Phishing' : 'Legitimate';
    const safeVotes = verdict.totalModels - verdict.phishingVotes;

    const banner = `
    <div class="status-banner ${statusClass}">
      <div class="status-headline">
        <div>
          <div class="status-title">${statusText}</div>
        </div>
      </div>
      <div class="ensemble-score">
        <div class="banner-score-value">${verdict.score.toFixed(1)}%</div>
        <div class="banner-score-label">Phishing risk</div>
      </div>
      <div class="ensemble-bar">
        <div class="prob-fill ${statusClass}" style="width:${verdict.score}%"></div>
      </div>
      <div class="status-votes">${verdict.phishingVotes}/${verdict.totalModels} models flagged phishing \u00b7 ${safeVotes}/${verdict.totalModels} say legitimate</div>
    </div>
    <div class="url-display">${esc(data.url)}</div>`;

    // Build tabs
    const tabs = [];
    const tabContents = [];

    // Tab 1: URL Models
    if (data.url_models) {
        tabs.push({ id: 'tabUrl', label: 'URL Models', count: data.url_models.predictions?.length || 0 });
        tabContents.push({ id: 'tabUrl', html: renderUrlModelsTab(data.url_models) });
    }

    // Tab 2: HTML Models (or the download-error notice if HTML fetch failed)
    if (data.html_models) {
        tabs.push({ id: 'tabHtml', label: 'HTML Models', count: data.html_models.predictions?.length || 0 });
        tabContents.push({ id: 'tabHtml', html: renderHtmlModelsTab(data.html_models) });
    } else if (data.html_error) {
        tabs.push({ id: 'tabHtml', label: 'HTML Models', count: 0 });
        tabContents.push({ id: 'tabHtml', html: `<div class="error-notice">HTML download failed: ${esc(data.html_error)}</div>` });
    }

    // Tab 3: Combined Models
    if (data.combined_models) {
        tabs.push({ id: 'tabCombined', label: 'Combined Models', count: data.combined_models.predictions?.length || 0 });
        tabContents.push({ id: 'tabCombined', html: renderCombinedModelsTab(data.combined_models) });
    }

    // Tab 4: CNN Models
    if (data.cnn_models) {
        tabs.push({ id: 'tabCnn', label: 'CNN Models', count: data.cnn_models.predictions?.length || 0 });
        tabContents.push({ id: 'tabCnn', html: renderCnnModelsTab(data.cnn_models) });
    }

    // First tab is active by default.
    const tabsHTML = tabs.map((t, i) => `
        <button class="tab ${i === 0 ? 'active' : ''}" onclick="switchTab(event,'${t.id}')">
            ${t.label} <span class="tab-count">${t.count}</span>
        </button>
    `).join('');

    const contentsHTML = tabContents.map((t, i) => `
        <div id="${t.id}" class="tab-content ${i === 0 ? 'active' : ''}">${t.html}</div>
    `).join('');

    el.innerHTML = `${banner}
        <div class="tabs">${tabsHTML}</div>
        ${contentsHTML}`;
}
308
+
309
// TAB RENDERERS
/**
 * Build the "URL Models" tab: one card per model prediction plus the
 * extracted URL-feature grid (tag 'url' scopes the show-more toggle).
 */
function renderUrlModelsTab(urlData) {
    const predictions = urlData.predictions || [];
    const features = urlData.features || {};

    return `
    <div class="section-title">Model Predictions</div>
    <div class="models-grid">${predictions.map(p => renderModelCard(p)).join('')}</div>
    ${renderFeatureSection(features, 'url')}
    `;
}
320
+
321
/**
 * Build the "HTML Models" tab: one card per model prediction plus the
 * extracted HTML-feature grid (tag 'html' scopes the show-more toggle).
 */
function renderHtmlModelsTab(htmlData) {
    const predictions = htmlData.predictions || [];
    const features = htmlData.features || {};

    return `
    <div class="section-title">Model Predictions</div>
    <div class="models-grid">${predictions.map(p => renderModelCard(p)).join('')}</div>
    ${renderFeatureSection(features, 'html')}
    `;
}
331
+
332
/**
 * Build the "Combined Models" tab: prediction cards plus a nested
 * sub-tab pair showing the URL features and (when available) the HTML
 * features that fed the combined models. When HTML features are
 * missing, an error notice is shown instead.
 * NOTE(review): `combinedData.html_error` is interpolated into
 * innerHTML unescaped here — confirm the server never echoes
 * attacker-controlled markup in that field.
 */
function renderCombinedModelsTab(combinedData) {
    const predictions = combinedData.predictions || [];
    const urlFeats = combinedData.url_features || {};
    const htmlFeats = combinedData.html_features || {};
    // Empty object means the HTML side could not be fetched/parsed.
    const hasHtmlF = Object.keys(htmlFeats).length > 0;

    return `
    <div class="section-title">Model Predictions</div>
    <div class="models-grid">${predictions.map(p => renderModelCard(p)).join('')}</div>
    <div class="combined-features-tabs">
        <div class="tabs">
            <button class="tab active" onclick="switchSubTab(event,'combinedUrlFeats')">URL Features</button>
            <button class="tab" onclick="switchSubTab(event,'combinedHtmlFeats')">HTML Features</button>
        </div>
        <div id="combinedUrlFeats" class="tab-content active">
            ${renderFeatureSection(urlFeats, 'combined-url')}
        </div>
        <div id="combinedHtmlFeats" class="tab-content">
            ${hasHtmlF
                ? renderFeatureSection(htmlFeats, 'combined-html')
                : `<div class="error-notice">HTML features unavailable${combinedData.html_error ? ': ' + combinedData.html_error : ''}</div>`}
        </div>
    </div>
    `;
}
357
+
358
/**
 * Build the "CNN Models" tab: prediction cards only — char-level CNNs
 * consume raw text, so there is no feature grid to display.
 */
function renderCnnModelsTab(cnnData) {
    const predictions = cnnData.predictions || [];

    return `
    <div class="section-title">Model Predictions</div>
    <div class="models-grid">${predictions.map(p => renderModelCard(p)).join('')}</div>
    `;
}
366
+
367
// MODEL CARDS & INFO
/**
 * Render one prediction card: model name, verdict chip, confidence
 * percentage, and safe/phishing probability bars. The colour scheme is
 * picked by comparing `pred.prediction` case-insensitively against
 * 'legitimate'; anything else is styled as danger.
 */
function renderModelCard(pred) {
    const isSafe = pred.prediction.toLowerCase() === 'legitimate';
    const cls = isSafe ? 'safe' : 'danger';
    return `
    <div class="model-card ${cls}">
        <div class="model-header">
            <div class="model-name">${pred.model_name}</div>
            <div class="model-prediction ${cls}">${pred.prediction}</div>
        </div>
        <div class="model-confidence">${pred.confidence.toFixed(1)}%</div>
        <div class="model-confidence-label">Confidence</div>
        <div class="prob-container">
            ${probRow('Safe', pred.legitimate_probability, 'safe')}
            ${probRow('Phishing', pred.phishing_probability, 'danger')}
        </div>
    </div>`;
}
385
+
386
/**
 * One labelled probability bar: `pct` (0–100) sets the fill width and
 * is echoed as a rounded percentage; `cls` ('safe'/'danger') colours it.
 */
function probRow(label, pct, cls) {
    return `
    <div class="prob-row">
        <span class="prob-label">${label}</span>
        <div class="prob-bar"><div class="prob-fill ${cls}" style="width:${pct}%"></div></div>
        <span class="prob-value">${pct.toFixed(0)}%</span>
    </div>`;
}
394
+
395
+
396
+
397
// FEATURE RENDERING
/**
 * Render a feature grid for one section: the top-20 catalogue entries
 * visible by default, the remainder in a hidden container toggled by a
 * "Show All Features" button. `tag` makes the toggle IDs unique per
 * section. Whether the payload is URL- or HTML-shaped is sniffed from
 * marker keys ('num_forms' / 'html_length').
 * NOTE(review): the button label counts raw feature keys, while
 * toggleAllFeatures() later counts rendered rows — the two can differ
 * when the payload contains keys absent from the catalogues.
 */
function renderFeatureSection(features, tag) {
    if (!features || Object.keys(features).length === 0) return '';

    const isHtml = 'num_forms' in features || 'html_length' in features;
    const topKeys = isHtml ? TOP_HTML_FEATURES : TOP_URL_FEATURES;
    const allKeys = isHtml ? ALL_HTML_FEATURES : ALL_URL_FEATURES;
    // Everything not in the top-20 goes into the collapsible remainder.
    const remaining = allKeys.filter(k => !topKeys.includes(k));

    const topHTML = renderFeatureList(topKeys, features);
    const remainingHTML = renderFeatureList(remaining, features);

    return `
    <div class="section-title">Extracted Features (Top 20)</div>
    <div class="features-grid">
        ${topHTML}
        <div id="hiddenFeatures-${tag}" class="features-hidden">${remainingHTML}</div>
    </div>
    <button class="show-more-btn" onclick="toggleAllFeatures('${tag}')" id="showMoreBtn-${tag}">
        Show All Features (${Object.keys(features).length})
    </button>`;
}
419
+
420
/** Render rows for the catalogue keys actually present in `features`, in order. */
function renderFeatureList(keys, features) {
    const rows = [];
    for (const key of keys) {
        if (key in features) {
            rows.push(renderFeature(key, features[key]));
        }
    }
    return rows.join('');
}
423
+
424
/**
 * Render one key/value feature row, colour-coded by heuristic rules:
 * boolean-style values (true/false and the numbers 0/1) are judged
 * against the GOOD/BAD indicator sets, then numeric values against the
 * DANGER/SAFE threshold tables (thresholds win when both apply).
 */
function renderFeature(key, value) {
    let itemClass = '';
    let valueClass = '';

    // 0/1 and true/false are all treated as boolean-style flags.
    const looksBoolean = typeof value === 'boolean' || value === 0 || value === 1;
    if (looksBoolean) {
        const truthy = value === true || value === 1;
        if (GOOD_INDICATORS.has(key)) {
            valueClass = truthy ? 'true' : 'false';
            itemClass = truthy ? 'highlight-safe' : 'highlight-danger';
        } else if (BAD_INDICATORS.has(key)) {
            valueClass = truthy ? 'false' : 'true';
            itemClass = truthy ? 'highlight-danger' : 'highlight-safe';
        }
    }

    // Numeric thresholds override any boolean colouring when they match.
    if (key in DANGER_THRESHOLDS) {
        const [limit, cmp] = DANGER_THRESHOLDS[key];
        if ((cmp === '>' && value > limit) || (cmp === '>=' && value >= limit)) {
            itemClass = 'highlight-danger';
        }
    }

    if (key in SAFE_THRESHOLDS) {
        const [limit, cmp] = SAFE_THRESHOLDS[key];
        if ((cmp === '<' && value < limit) || (cmp === '==' && value === limit)) {
            itemClass = 'highlight-safe';
        }
    }

    return `
    <div class="feature-item ${itemClass}">
        <span class="feature-label">${formatName(key)}</span>
        <span class="feature-value ${valueClass}">${formatValue(value)}</span>
    </div>`;
}
460
+
461
/**
 * Activate a top-level tab. De-activation is scoped to the tab bar's
 * parent container (falling back to document) so nested sub-tab groups
 * keep their own active state: only the parent's direct-child
 * `.tab-content` panels (`:scope >`) are reset.
 */
function switchTab(event, tabId) {
    const parent = event.currentTarget.closest('.tabs')?.parentElement ?? document;
    parent.querySelectorAll('.tabs > .tab').forEach(t => t.classList.remove('active'));
    parent.querySelectorAll(':scope > .tab-content').forEach(c => c.classList.remove('active'));
    event.currentTarget.classList.add('active');
    document.getElementById(tabId).classList.add('active');
}
468
+
469
/** Activate a sub-tab inside the combined-features panel. */
function switchSubTab(event, tabId) {
    const container = event.currentTarget.closest('.combined-features-tabs');
    for (const tab of container.querySelectorAll('.tab')) {
        tab.classList.remove('active');
    }
    for (const panel of container.querySelectorAll('.tab-content')) {
        panel.classList.remove('active');
    }
    event.currentTarget.classList.add('active');
    document.getElementById(tabId).classList.add('active');
}
476
+
477
/** Collapse/expand the panel following a header, swapping its +/− icon. */
function toggleFeatures(el) {
    const panel = el.nextElementSibling;
    const nowOpen = panel.classList.toggle('open');
    const icon = el.querySelector('.toggle-icon');
    icon.textContent = nowOpen ? '\u2212' : '+';
}
483
+
484
/**
 * Toggle visibility of the non-top-20 feature rows for a section tag,
 * updating the button label to match.
 * NOTE(review): the re-computed total counts rendered rows, which can
 * be smaller than the raw feature count used for the initial label in
 * renderFeatureSection() — confirm the discrepancy is intended.
 */
function toggleAllFeatures(type) {
    const hidden = document.getElementById('hiddenFeatures-' + type);
    const btn = document.getElementById('showMoreBtn-' + type);
    // classList.toggle returns true when the class was just ADDED,
    // i.e. the extra rows are hidden again.
    if (hidden.classList.toggle('features-hidden')) {
        const total = hidden.closest('.features-grid')?.querySelectorAll('.feature-item').length ?? 0;
        btn.textContent = `Show All Features (${total})`;
    } else {
        btn.textContent = 'Show Less';
    }
}
494
+
495
/** Display name for a feature key: "num_dots" → "Num Dots". */
function formatName(name) {
    const spaced = name.replace(/_/g, ' ');
    return spaced.replace(/\b\w/g, ch => ch.toUpperCase());
}
498
+
499
/**
 * Human-readable feature value: booleans and the numbers 0/1 become
 * Yes/No, non-integer numbers get two decimals, and everything else
 * (integers, strings) passes through unchanged.
 */
function formatValue(value) {
    if (typeof value === 'boolean') return value ? 'Yes' : 'No';
    if (value === 1) return 'Yes';
    if (value === 0) return 'No';
    if (typeof value === 'number' && value % 1 !== 0) return value.toFixed(2);
    return value;
}
505
+
506
// Wire up Enter-key submission once the DOM is ready.
document.addEventListener('DOMContentLoaded', () => {
    const field = $url();
    if (!field) return;
    field.addEventListener('keypress', evt => {
        if (evt.key === 'Enter') analyzeAll();
    });
});
server/static/style.css ADDED
@@ -0,0 +1,1325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ * {
2
+ margin: 0;
3
+ padding: 0;
4
+ box-sizing: border-box;
5
+ }
6
+
7
+ html, body {
8
+ scrollbar-width: none;
9
+ -ms-overflow-style: none;
10
+ }
11
+
12
+ html::-webkit-scrollbar,
13
+ body::-webkit-scrollbar {
14
+ display: none;
15
+ }
16
+
17
+ :root {
18
+ --bg: #ffffff;
19
+ --bg-secondary: #f5f5f5;
20
+ --text: #000000;
21
+ --text-secondary: #666666;
22
+ --border: #e0e0e0;
23
+ --safe: #00a86b;
24
+ --safe-bg: #e8f5e9;
25
+ --danger: #dc2626;
26
+ --danger-bg: #fef2f2;
27
+ --accent: #000000;
28
+ }
29
+
30
+ /*
31
+ * DESIGN SYSTEM REFERENCE
32
+ * ───────────────────────────────────────────────
33
+ *
34
+ * Typography:
35
+ * Label: 10px / 600 / 0.15em / uppercase / --text-secondary
36
+ * Button: 11px / 600 / 0.1em / uppercase
37
+ * Body: 12-13px / 400 / --text or --text-secondary
38
+ * Mono: 'SF Mono', 'Monaco', 'Inconsolata', monospace
39
+ * Heading: 16-28px / 700
40
+ *
41
+ * Spacing: 8, 12, 16, 20, 24, 32, 40, 48px
42
+ * Borders: 1px solid var(--border)
43
+ * Accents: border-left 3-4px solid var(--safe | --danger | --accent)
44
+ *
45
+ * Color modifiers (add to element):
46
+ * .safe → green state (--safe, --safe-bg)
47
+ * .danger → red state (--danger, --danger-bg)
48
+ *
49
+ * Reusable components:
50
+ * .btn filled button
51
+ * .btn-secondary ghost button (use with .btn)
52
+ * .btn-outline standalone outline link/button
53
+ * .tabs / .tab tab navigation (nest for sub-tabs)
54
+ * .tab-count badge inside .tab
55
+ * .tab-content tab panel (add .active)
56
+ * .section-title section header label with border
57
+ * .status-banner verdict banner (add .safe/.danger)
58
+ * .model-card prediction card (add .safe/.danger)
59
+ * .prob-bar/.prob-fill 4px progress bar (add .safe/.danger to fill)
60
+ * .feature-item key-value pair row
61
+ * .features-grid auto-fill grid for feature items
62
+ * .models-grid auto-fit grid for cards
63
+ * .error-notice error message block
64
+ */
65
+
66
/* Base typography and page colours. */
body {
    font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
    background: var(--bg);
    color: var(--text);
    min-height: 100vh;
    line-height: 1.4;
    font-weight: 400;
    -webkit-font-smoothing: antialiased;
}

/* Centered content column, capped at 960px. */
.container {
    max-width: 960px;
    margin: 0 auto;
    padding: 0 24px;
}
81
+
82
+ /* HEADER */
83
+ header {
84
+ padding: 48px 0 40px;
85
+ border-bottom: 1px solid var(--border);
86
+ }
87
+
88
+ .logo {
89
+ font-size: 11px;
90
+ font-weight: 700;
91
+ letter-spacing: 0.2em;
92
+ text-transform: uppercase;
93
+ margin-bottom: 4px;
94
+ }
95
+
96
+ .tagline {
97
+ font-size: 11px;
98
+ color: var(--text-secondary);
99
+ letter-spacing: 0.05em;
100
+ }
101
+
102
+ /* INPUT SECTION */
103
+ .input-section {
104
+ padding: 48px 0;
105
+ border-bottom: 1px solid var(--border);
106
+ }
107
+
108
+ .input-label {
109
+ font-size: 10px;
110
+ font-weight: 600;
111
+ letter-spacing: 0.15em;
112
+ text-transform: uppercase;
113
+ color: var(--text-secondary);
114
+ margin-bottom: 12px;
115
+ display: block;
116
+ }
117
+
118
+ .input-wrapper {
119
+ display: flex;
120
+ gap: 0;
121
+ margin-bottom: 16px;
122
+ }
123
+
124
+ input[type="text"] {
125
+ flex: 1;
126
+ padding: 16px 20px;
127
+ border: 1px solid var(--border);
128
+ border-right: none;
129
+ font-size: 14px;
130
+ font-family: inherit;
131
+ background: var(--bg);
132
+ color: var(--text);
133
+ outline: none;
134
+ transition: border-color 0.2s;
135
+ }
136
+
137
+ input[type="text"]:focus {
138
+ border-color: var(--text);
139
+ }
140
+
141
+ input[type="text"]::placeholder {
142
+ color: #999;
143
+ }
144
+
145
+ .btn {
146
+ padding: 16px 32px;
147
+ border: 1px solid var(--text);
148
+ background: var(--text);
149
+ color: var(--bg);
150
+ font-size: 11px;
151
+ font-weight: 600;
152
+ letter-spacing: 0.1em;
153
+ text-transform: uppercase;
154
+ cursor: pointer;
155
+ font-family: inherit;
156
+ transition: all 0.2s;
157
+ white-space: nowrap;
158
+ }
159
+
160
+ .btn:hover {
161
+ background: #333;
162
+ }
163
+
164
+ .btn:active {
165
+ transform: scale(0.98);
166
+ }
167
+
168
+ .btn-group {
169
+ display: flex;
170
+ gap: 8px;
171
+ }
172
+
173
+ .btn-secondary {
174
+ background: var(--bg);
175
+ color: var(--text);
176
+ }
177
+
178
+ .btn-secondary:hover {
179
+ background: var(--bg-secondary);
180
+ }
181
+
182
+ /* OUTLINE BUTTON — standalone outline link or button */
183
+ .btn-outline,
184
+ .learn-more-btn,
185
+ .show-more-btn,
186
+ .back-link {
187
+ display: inline-block;
188
+ padding: 10px 20px;
189
+ border: 1px solid var(--border);
190
+ background: transparent;
191
+ color: var(--text);
192
+ font-size: 11px;
193
+ font-weight: 600;
194
+ letter-spacing: 0.1em;
195
+ text-transform: uppercase;
196
+ text-decoration: none;
197
+ font-family: inherit;
198
+ cursor: pointer;
199
+ transition: all 0.2s;
200
+ }
201
+
202
+ .btn-outline:hover,
203
+ .learn-more-btn:hover,
204
+ .show-more-btn:hover,
205
+ .back-link:hover {
206
+ border-color: var(--text);
207
+ background: var(--bg-secondary);
208
+ }
209
+
210
+ /* LOADING */
211
+ .loading {
212
+ display: none;
213
+ padding: 80px 0;
214
+ text-align: center;
215
+ }
216
+
217
+ .loading-bar {
218
+ width: 48px;
219
+ height: 2px;
220
+ background: var(--border);
221
+ margin: 0 auto 16px;
222
+ position: relative;
223
+ overflow: hidden;
224
+ }
225
+
226
+ .loading-bar::after {
227
+ content: '';
228
+ position: absolute;
229
+ left: -50%;
230
+ width: 50%;
231
+ height: 100%;
232
+ background: var(--text);
233
+ animation: loading 1s ease-in-out infinite;
234
+ }
235
+
236
+ @keyframes loading {
237
+ 0% { left: -50%; }
238
+ 100% { left: 100%; }
239
+ }
240
+
241
+ .loading-text {
242
+ font-size: 10px;
243
+ letter-spacing: 0.15em;
244
+ text-transform: uppercase;
245
+ color: var(--text-secondary);
246
+ }
247
+
248
+ /* RESULTS */
249
+ .results {
250
+ display: none;
251
+ padding: 48px 0;
252
+ }
253
+
254
+ /* STATUS BANNER */
255
+ .status-banner {
256
+ padding: 32px;
257
+ margin-bottom: 32px;
258
+ text-align: center;
259
+ }
260
+
261
+ .status-banner.safe {
262
+ background: var(--safe-bg);
263
+ border-left: 4px solid var(--safe);
264
+ }
265
+
266
+ .status-banner.danger {
267
+ background: var(--danger-bg);
268
+ border-left: 4px solid var(--danger);
269
+ }
270
+
271
+ .status-icon {
272
+ font-size: 30px;
273
+ line-height: 1;
274
+ }
275
+
276
+ .status-banner.safe .status-icon {
277
+ color: var(--safe);
278
+ }
279
+
280
+ .status-banner.danger .status-icon {
281
+ color: var(--danger);
282
+ }
283
+
284
+ .status-title {
285
+ font-size: 28px;
286
+ font-weight: 700;
287
+ letter-spacing: 0.08em;
288
+ text-transform: uppercase;
289
+ }
290
+
291
+ .status-banner.safe .status-title {
292
+ color: var(--safe);
293
+ }
294
+
295
+ .status-banner.danger .status-title {
296
+ color: var(--danger);
297
+ }
298
+
299
+ .status-subtitle {
300
+ font-size: 11px;
301
+ color: var(--text-secondary);
302
+ letter-spacing: 0.08em;
303
+ text-transform: uppercase;
304
+ margin-top: 4px;
305
+ }
306
+
307
+ /* ENSEMBLE SCORE */
308
+ .ensemble-score {
309
+ display: flex;
310
+ align-items: baseline;
311
+ justify-content: center;
312
+ gap: 4px;
313
+ margin: 4px;
314
+ }
315
+
316
+ .status-banner.safe .model-confidence {
317
+ color: var(--safe);
318
+ }
319
+
320
+ .status-banner.danger .model-confidence {
321
+ color: var(--danger);
322
+ }
323
+
324
+ .ensemble-bar {
325
+ width: 100%;
326
+ max-width: 360px;
327
+ height: 4px;
328
+ background: var(--border);
329
+ margin: 10px auto 8px;
330
+ }
331
+
332
+ /* ensemble-bar uses .prob-fill for the fill element */
333
+
334
+ .status-kicker {
335
+ font-size: 10px;
336
+ font-weight: 600;
337
+ letter-spacing: 0.14em;
338
+ text-transform: uppercase;
339
+ color: var(--text-secondary);
340
+ margin-bottom: 10px;
341
+ }
342
+
343
+ .status-headline {
344
+ display: inline-flex;
345
+ align-items: center;
346
+ gap: 12px;
347
+ }
348
+
349
+ .status-headline > div:last-child {
350
+ text-align: left;
351
+ }
352
+
353
+ .banner-score-label {
354
+ font-size: 10px;
355
+ font-weight: 600;
356
+ letter-spacing: 0.1em;
357
+ text-transform: uppercase;
358
+ color: var(--text-secondary);
359
+ }
360
+
361
+ .banner-score-value {
362
+ font-size: 10px;
363
+ font-weight: 700;
364
+ letter-spacing: 0.04em;
365
+ }
366
+
367
+ .status-banner.safe .banner-score-value {
368
+ color: var(--safe);
369
+ }
370
+
371
+ .status-banner.danger .banner-score-value {
372
+ color: var(--danger);
373
+ }
374
+
375
+ .banner-score-note {
376
+ font-size: 10px;
377
+ color: var(--text-secondary);
378
+ letter-spacing: 0.08em;
379
+ text-transform: uppercase;
380
+ margin-bottom: 10px;
381
+ }
382
+
383
+ .status-votes {
384
+ font-size: 10px;
385
+ color: var(--text-secondary);
386
+ letter-spacing: 0.08em;
387
+ text-transform: uppercase;
388
+ }
389
+
390
+ /* URL DISPLAY */
391
+ .url-display {
392
+ padding: 16px 20px;
393
+ background: var(--bg-secondary);
394
+ font-family: 'SF Mono', 'Monaco', 'Inconsolata', monospace;
395
+ font-size: 13px;
396
+ word-break: break-all;
397
+ margin-bottom: 32px;
398
+ border-left: 2px solid var(--border);
399
+ }
400
+
401
+ /* SECTION TITLES */
402
+ .section-title {
403
+ font-size: 10px;
404
+ font-weight: 600;
405
+ letter-spacing: 0.15em;
406
+ text-transform: uppercase;
407
+ color: var(--text-secondary);
408
+ margin-bottom: 16px;
409
+ padding-bottom: 8px;
410
+ border-bottom: 1px solid var(--border);
411
+ }
412
+
413
+ /* TABS */
414
+ .tabs {
415
+ display: flex;
416
+ gap: 0;
417
+ margin-bottom: 32px;
418
+ border-bottom: 1px solid var(--border);
419
+ }
420
+
421
+ .tab {
422
+ padding: 12px 24px;
423
+ font-size: 11px;
424
+ font-weight: 600;
425
+ letter-spacing: 0.1em;
426
+ text-transform: uppercase;
427
+ background: none;
428
+ border: none;
429
+ border-bottom: 2px solid transparent;
430
+ cursor: pointer;
431
+ color: var(--text-secondary);
432
+ transition: all 0.2s;
433
+ font-family: inherit;
434
+ margin-bottom: -1px;
435
+ }
436
+
437
+ .tab:hover {
438
+ color: var(--text);
439
+ }
440
+
441
+ .tab.active {
442
+ color: var(--text);
443
+ border-bottom-color: var(--text);
444
+ }
445
+
446
+ .tab-content {
447
+ display: none;
448
+ }
449
+
450
+ .tab-content.active {
451
+ display: block;
452
+ }
453
+
454
+ /* MODEL CARDS */
455
+ .models-grid {
456
+ display: grid;
457
+ grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
458
+ gap: 16px;
459
+ margin-bottom: 40px;
460
+ }
461
+
462
+ .model-card {
463
+ padding: 24px;
464
+ border: 1px solid var(--border);
465
+ background: var(--bg);
466
+ }
467
+
468
+ .model-card.safe {
469
+ border-left: 3px solid var(--safe);
470
+ }
471
+
472
+ .model-card.danger {
473
+ border-left: 3px solid var(--danger);
474
+ }
475
+
476
+ .model-header {
477
+ display: flex;
478
+ justify-content: space-between;
479
+ align-items: flex-start;
480
+ margin-bottom: 16px;
481
+ }
482
+
483
+ .model-name {
484
+ font-size: 11px;
485
+ font-weight: 600;
486
+ letter-spacing: 0.1em;
487
+ text-transform: uppercase;
488
+ }
489
+
490
+ .model-prediction {
491
+ font-size: 11px;
492
+ font-weight: 700;
493
+ letter-spacing: 0.05em;
494
+ text-transform: uppercase;
495
+ padding: 4px 8px;
496
+ }
497
+
498
+ .model-prediction.safe {
499
+ color: var(--safe);
500
+ background: var(--safe-bg);
501
+ }
502
+
503
+ .model-prediction.danger {
504
+ color: var(--danger);
505
+ background: var(--danger-bg);
506
+ }
507
+
508
+ .model-confidence {
509
+ font-size: 10px;
510
+ font-weight: 700;
511
+ margin-bottom: 8px;
512
+ }
513
+
514
+ .model-confidence-label {
515
+ font-size: 10px;
516
+ color: var(--text-secondary);
517
+ letter-spacing: 0.1em;
518
+ text-transform: uppercase;
519
+ }
520
+
521
+ /* PROBABILITY BAR */
522
+ .prob-container {
523
+ margin-top: 16px;
524
+ }
525
+
526
+ .prob-row {
527
+ display: flex;
528
+ align-items: center;
529
+ gap: 12px;
530
+ margin-bottom: 8px;
531
+ }
532
+
533
+ .prob-label {
534
+ font-size: 10px;
535
+ text-transform: uppercase;
536
+ letter-spacing: 0.05em;
537
+ width: 70px;
538
+ color: var(--text-secondary);
539
+ }
540
+
541
+ .prob-bar {
542
+ flex: 1;
543
+ height: 4px;
544
+ background: var(--bg-secondary);
545
+ position: relative;
546
+ }
547
+
548
+ .prob-fill {
549
+ height: 100%;
550
+ transition: width 0.5s ease;
551
+ }
552
+
553
+ .prob-fill.safe {
554
+ background: var(--safe);
555
+ }
556
+
557
+ .prob-fill.danger {
558
+ background: var(--danger);
559
+ }
560
+
561
+ .prob-value {
562
+ font-size: 11px;
563
+ font-weight: 600;
564
+ width: 45px;
565
+ text-align: right;
566
+ }
567
+
568
+ /* FEATURES GRID */
569
+ .features-grid {
570
+ display: grid;
571
+ grid-template-columns: repeat(auto-fill, minmax(220px, 1fr));
572
+ gap: 8px;
573
+ margin-bottom: 32px;
574
+ }
575
+
576
+ .feature-item {
577
+ display: flex;
578
+ justify-content: space-between;
579
+ align-items: center;
580
+ padding: 12px 16px;
581
+ background: var(--bg-secondary);
582
+ border-left: 2px solid var(--border);
583
+ font-size: 12px;
584
+ transition: border-color 0.2s;
585
+ }
586
+
587
+ .feature-item:hover {
588
+ border-left-color: var(--text);
589
+ }
590
+
591
+ .feature-item.highlight-safe {
592
+ border-left-color: var(--safe);
593
+ background: var(--safe-bg);
594
+ }
595
+
596
+ .feature-item.highlight-danger {
597
+ border-left-color: var(--danger);
598
+ background: var(--danger-bg);
599
+ }
600
+
601
+ .feature-label {
602
+ color: var(--text-secondary);
603
+ font-size: 11px;
604
+ }
605
+
606
+ .feature-value {
607
+ font-weight: 600;
608
+ font-family: 'SF Mono', 'Monaco', monospace;
609
+ font-size: 12px;
610
+ }
611
+
612
+ .feature-value.true {
613
+ color: var(--safe);
614
+ }
615
+
616
+ .feature-value.false {
617
+ color: var(--danger);
618
+ }
619
+
620
+ .show-more-btn { width: 100%; margin-top: 16px; }
621
+
622
+ [id^="hiddenFeatures-"] {
623
+ display: contents;
624
+ }
625
+
626
+ [id^="hiddenFeatures-"].features-hidden {
627
+ display: none;
628
+ }
629
+
630
+ /* FOOTER */
631
+ footer {
632
+ padding: 32px 0;
633
+ border-top: 1px solid var(--border);
634
+ text-align: center;
635
+ }
636
+
637
+ .footer-text {
638
+ font-size: 10px;
639
+ color: var(--text-secondary);
640
+ letter-spacing: 0.1em;
641
+ text-transform: uppercase;
642
+ }
643
+
644
+ .learn-more-btn { margin-top: 16px; padding: 12px 28px; }
645
+
646
+ /* TAB COUNT BADGE */
647
+ .tab-count {
648
+ display: inline-block;
649
+ background: var(--bg-secondary);
650
+ border: 1px solid var(--border);
651
+ font-size: 10px;
652
+ font-weight: 700;
653
+ padding: 1px 6px;
654
+ margin-left: 4px;
655
+ vertical-align: middle;
656
+ }
657
+
658
+ .tab.active .tab-count {
659
+ background: var(--text);
660
+ color: var(--bg);
661
+ border-color: var(--text);
662
+ }
663
+
664
+
665
+
666
+ /* FEATURE SPLIT BAR */
667
+ .feature-split {
668
+ margin-top: 16px;
669
+ }
670
+
671
+ .feature-split-bar {
672
+ display: flex;
673
+ height: 32px;
674
+ overflow: hidden;
675
+ border: 1px solid var(--border);
676
+ }
677
+
678
+ .split-url {
679
+ background: var(--bg-secondary);
680
+ display: flex;
681
+ align-items: center;
682
+ justify-content: center;
683
+ font-size: 10px;
684
+ font-weight: 600;
685
+ letter-spacing: 0.05em;
686
+ text-transform: uppercase;
687
+ border-right: 1px solid var(--border);
688
+ }
689
+
690
+ .split-html {
691
+ background: var(--text);
692
+ color: var(--bg);
693
+ display: flex;
694
+ align-items: center;
695
+ justify-content: center;
696
+ font-size: 10px;
697
+ font-weight: 600;
698
+ letter-spacing: 0.05em;
699
+ text-transform: uppercase;
700
+ }
701
+
702
+ /* COMBINED FEATURES TABS */
703
+ .combined-features-tabs {
704
+ margin-top: 32px;
705
+ }
706
+
707
+ /* ERROR NOTICE */
708
+ .error-notice {
709
+ font-size: 12px;
710
+ color: var(--danger);
711
+ padding: 16px 20px;
712
+ background: var(--danger-bg);
713
+ border-left: 2px solid var(--danger);
714
+ margin: 16px 0;
715
+ }
716
+
717
+ /* RESPONSIVE */
718
+ @media (max-width: 640px) {
719
+ header {
720
+ padding: 32px 0;
721
+ }
722
+
723
+ .input-section {
724
+ padding: 32px 0;
725
+ }
726
+
727
+ .input-wrapper {
728
+ flex-direction: column;
729
+ }
730
+
731
+ input[type="text"] {
732
+ border-right: 1px solid var(--border);
733
+ border-bottom: none;
734
+ }
735
+
736
+ .btn {
737
+ width: 100%;
738
+ }
739
+
740
+ .btn-group {
741
+ flex-direction: column;
742
+ }
743
+
744
+ .models-grid {
745
+ grid-template-columns: 1fr;
746
+ }
747
+
748
+ .status-title {
749
+ font-size: 22px;
750
+ }
751
+
752
+ .status-headline {
753
+ gap: 10px;
754
+ }
755
+
756
+ .status-icon {
757
+ font-size: 26px;
758
+ }
759
+
760
+ .banner-score-value {
761
+ font-size: 14px;
762
+ }
763
+ }
764
+
765
+ /* =============================================
766
+ MODELS PAGE
767
+ ============================================= */
768
+
769
+ /* MODELS PAGE - HEADER */
770
+ .models-page header {
771
+ display: flex;
772
+ justify-content: space-between;
773
+ align-items: flex-end;
774
+ }
775
+
776
+ .header-left {
777
+ display: flex;
778
+ flex-direction: column;
779
+ }
780
+
781
+ .logo a {
782
+ color: var(--text);
783
+ text-decoration: none;
784
+ }
785
+
786
+ .logo a:hover {
787
+ opacity: 0.7;
788
+ }
789
+
790
+ .back-link { padding: 8px 16px; }
791
+
792
+ /* PAGE TITLE */
793
+ .page-title-section {
794
+ padding: 48px 0;
795
+ border-bottom: 1px solid var(--border);
796
+ }
797
+
798
+ .page-title {
799
+ font-size: 28px;
800
+ font-weight: 700;
801
+ letter-spacing: 0.02em;
802
+ margin-bottom: 8px;
803
+ }
804
+
805
+ .page-description {
806
+ font-size: 13px;
807
+ color: var(--text-secondary);
808
+ line-height: 1.6;
809
+ max-width: 640px;
810
+ }
811
+
812
+ /* SECTION */
813
+ .section {
814
+ padding: 40px 0;
815
+ border-bottom: 1px solid var(--border);
816
+ }
817
+
818
+ .section:last-child {
819
+ border-bottom: none;
820
+ }
821
+
822
+ .section-subtitle {
823
+ font-size: 13px;
824
+ color: var(--text-secondary);
825
+ margin-bottom: 24px;
826
+ line-height: 1.5;
827
+ }
828
+
829
+ /* COMPARISON TABLE */
830
+ .comparison-table {
831
+ width: 100%;
832
+ border-collapse: collapse;
833
+ font-size: 13px;
834
+ margin-bottom: 24px;
835
+ }
836
+
837
+ .comparison-table th {
838
+ text-align: left;
839
+ padding: 12px 16px;
840
+ font-size: 10px;
841
+ font-weight: 600;
842
+ letter-spacing: 0.12em;
843
+ text-transform: uppercase;
844
+ color: var(--text-secondary);
845
+ border-bottom: 2px solid var(--border);
846
+ white-space: nowrap;
847
+ }
848
+
849
+ .comparison-table td {
850
+ padding: 12px 16px;
851
+ border-bottom: 1px solid var(--border);
852
+ }
853
+
854
+ .comparison-table tr:hover td {
855
+ background: var(--bg-secondary);
856
+ }
857
+
858
+ .comparison-table .model-name-cell {
859
+ font-weight: 600;
860
+ font-size: 12px;
861
+ }
862
+
863
+ .comparison-table .best {
864
+ color: var(--safe);
865
+ font-weight: 700;
866
+ }
867
+
868
+ /* METRIC CARDS */
869
+ .metrics-grid {
870
+ display: grid;
871
+ grid-template-columns: repeat(auto-fit, minmax(160px, 1fr));
872
+ gap: 16px;
873
+ margin-bottom: 32px;
874
+ }
875
+
876
+ .metric-card {
877
+ padding: 20px;
878
+ border: 1px solid var(--border);
879
+ text-align: center;
880
+ }
881
+
882
+ .metric-value {
883
+ font-size: 28px;
884
+ font-weight: 700;
885
+ margin-bottom: 4px;
886
+ }
887
+
888
+ .metric-label {
889
+ font-size: 10px;
890
+ color: var(--text-secondary);
891
+ letter-spacing: 0.1em;
892
+ text-transform: uppercase;
893
+ }
894
+
895
+ .metric-value.highlight {
896
+ color: var(--safe);
897
+ }
898
+
899
+ /* MODEL DETAIL CARDS */
900
+ .model-detail {
901
+ margin-bottom: 40px;
902
+ padding: 32px;
903
+ border: 1px solid var(--border);
904
+ border-left: 3px solid var(--accent);
905
+ }
906
+
907
+ .model-detail-header {
908
+ display: flex;
909
+ justify-content: space-between;
910
+ align-items: flex-start;
911
+ margin-bottom: 24px;
912
+ flex-wrap: wrap;
913
+ gap: 12px;
914
+ }
915
+
916
+ .model-detail-name {
917
+ font-size: 16px;
918
+ font-weight: 700;
919
+ letter-spacing: 0.02em;
920
+ }
921
+
922
+ .model-detail-type {
923
+ font-size: 10px;
924
+ font-weight: 600;
925
+ letter-spacing: 0.1em;
926
+ text-transform: uppercase;
927
+ color: var(--text-secondary);
928
+ padding: 4px 10px;
929
+ border: 1px solid var(--border);
930
+ }
931
+
932
+ /* CONFUSION MATRIX */
933
+ .confusion-matrix {
934
+ display: inline-grid;
935
+ grid-template-columns: auto auto auto;
936
+ gap: 0;
937
+ margin: 16px 0;
938
+ font-size: 13px;
939
+ }
940
+
941
+ .cm-header {
942
+ padding: 8px 20px;
943
+ font-size: 10px;
944
+ font-weight: 600;
945
+ letter-spacing: 0.08em;
946
+ text-transform: uppercase;
947
+ color: var(--text-secondary);
948
+ text-align: center;
949
+ }
950
+
951
+ .cm-cell {
952
+ padding: 16px 24px;
953
+ text-align: center;
954
+ font-weight: 700;
955
+ font-size: 14px;
956
+ font-family: 'SF Mono', 'Monaco', 'Inconsolata', monospace;
957
+ border: 1px solid var(--border);
958
+ }
959
+
960
+ .cm-tp { background: var(--safe-bg); color: var(--safe); }
961
+ .cm-tn { background: var(--safe-bg); color: var(--safe); }
962
+ .cm-fp { background: var(--danger-bg); color: var(--danger); }
963
+ .cm-fn { background: var(--danger-bg); color: var(--danger); }
964
+
965
+ .cm-label {
966
+ padding: 8px 16px;
967
+ font-size: 10px;
968
+ font-weight: 600;
969
+ letter-spacing: 0.08em;
970
+ text-transform: uppercase;
971
+ color: var(--text-secondary);
972
+ display: flex;
973
+ align-items: center;
974
+ justify-content: center;
975
+ }
976
+
977
+ /* FEATURES LIST */
978
+ .features-list {
979
+ display: grid;
980
+ grid-template-columns: 1fr;
981
+ gap: 4px;
982
+ margin-top: 16px;
983
+ }
984
+
985
+ .feature-row {
986
+ display: flex;
987
+ justify-content: space-between;
988
+ align-items: center;
989
+ padding: 10px 16px;
990
+ background: var(--bg-secondary);
991
+ font-size: 12px;
992
+ }
993
+
994
+ .feature-row:nth-child(even) {
995
+ background: var(--bg);
996
+ }
997
+
998
+ .feature-rank {
999
+ font-size: 10px;
1000
+ color: var(--text-secondary);
1001
+ font-weight: 600;
1002
+ width: 24px;
1003
+ flex-shrink: 0;
1004
+ }
1005
+
1006
+ .feature-name {
1007
+ flex: 1;
1008
+ font-family: 'SF Mono', 'Monaco', 'Inconsolata', monospace;
1009
+ font-size: 11px;
1010
+ }
1011
+
1012
+ .feature-importance {
1013
+ font-weight: 700;
1014
+ font-family: 'SF Mono', 'Monaco', 'Inconsolata', monospace;
1015
+ font-size: 11px;
1016
+ text-align: right;
1017
+ width: 70px;
1018
+ flex-shrink: 0;
1019
+ }
1020
+
1021
+ .importance-bar-bg {
1022
+ flex: 1;
1023
+ max-width: 120px;
1024
+ height: 4px;
1025
+ background: var(--border);
1026
+ margin: 0 12px;
1027
+ flex-shrink: 0;
1028
+ }
1029
+
1030
+ .importance-bar-fill {
1031
+ height: 100%;
1032
+ background: var(--text);
1033
+ }
1034
+
1035
+ /* HYPERPARAMS */
1036
+ .params-grid {
1037
+ display: grid;
1038
+ grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
1039
+ gap: 8px;
1040
+ }
1041
+
1042
+ .param-item {
1043
+ display: flex;
1044
+ justify-content: space-between;
1045
+ padding: 10px 16px;
1046
+ background: var(--bg-secondary);
1047
+ font-size: 12px;
1048
+ }
1049
+
1050
+ .param-key {
1051
+ color: var(--text-secondary);
1052
+ font-size: 11px;
1053
+ }
1054
+
1055
+ .param-value {
1056
+ font-weight: 600;
1057
+ font-family: 'SF Mono', 'Monaco', 'Inconsolata', monospace;
1058
+ font-size: 11px;
1059
+ }
1060
+
1061
+ /* PIPELINE DIAGRAM */
1062
+ .pipeline {
1063
+ display: flex;
1064
+ margin: 24px 0;
1065
+ counter-reset: step;
1066
+ }
1067
+
1068
+ .pipeline-step {
1069
+ flex: 1;
1070
+ display: flex;
1071
+ align-items: center;
1072
+ gap: 10px;
1073
+ padding: 10px 12px;
1074
+ border: 1px solid var(--border);
1075
+ border-right: none;
1076
+ font-size: 11px;
1077
+ font-weight: 600;
1078
+ letter-spacing: 0.05em;
1079
+ text-transform: uppercase;
1080
+ background: var(--bg);
1081
+ position: relative;
1082
+ }
1083
+
1084
+ .pipeline-step:last-child { border-right: 1px solid var(--border); }
1085
+
1086
+ /* Arrow connector between steps */
1087
+ .pipeline-step:not(:last-child)::after {
1088
+ content: '\2192';
1089
+ position: absolute;
1090
+ right: -4px;
1091
+ top: 50%;
1092
+ transform: translate(50%, -50%);
1093
+ font-size: 11px;
1094
+ color: var(--text-secondary);
1095
+ background: var(--bg);
1096
+ z-index: 1;
1097
+ padding: 2px 0;
1098
+ line-height: 1;
1099
+ }
1100
+
1101
+ .pipeline-step .step-number {
1102
+ width: 22px;
1103
+ height: 22px;
1104
+ display: flex;
1105
+ align-items: center;
1106
+ justify-content: center;
1107
+ border-radius: 50%;
1108
+ background: var(--text);
1109
+ color: var(--bg);
1110
+ font-size: 11px;
1111
+ font-weight: 700;
1112
+ flex-shrink: 0;
1113
+ line-height: 1;
1114
+ }
1115
+
1116
+ /* COLLAPSIBLE TOGGLE */
1117
+ .collapsible-toggle {
1118
+ cursor: pointer;
1119
+ display: flex;
1120
+ align-items: center;
1121
+ user-select: none;
1122
+ }
1123
+
1124
+ .collapsible-toggle:hover {
1125
+ color: var(--text);
1126
+ }
1127
+
1128
+ .toggle-icon {
1129
+ margin-left: auto;
1130
+ font-size: 16px;
1131
+ font-weight: 400;
1132
+ color: var(--text-secondary);
1133
+ transition: transform 0.2s;
1134
+ flex-shrink: 0;
1135
+ width: 20px;
1136
+ text-align: center;
1137
+ line-height: 1;
1138
+ }
1139
+
1140
+ .collapsible-content {
1141
+ display: none;
1142
+ padding-top: 8px;
1143
+ }
1144
+
1145
+ .collapsible-content.open {
1146
+ display: block;
1147
+ }
1148
+
1149
+ /* FEATURE GRID */
1150
+ .feature-count {
1151
+ font-weight: 400;
1152
+ color: var(--text-secondary);
1153
+ letter-spacing: 0.05em;
1154
+ margin-left: 8px;
1155
+ }
1156
+
1157
+ .feature-category-label {
1158
+ font-size: 12px;
1159
+ font-weight: 700;
1160
+ letter-spacing: 0.04em;
1161
+ margin: 24px 0 10px;
1162
+ color: var(--text);
1163
+ }
1164
+
1165
+ .feature-category-label:first-of-type {
1166
+ margin-top: 0;
1167
+ }
1168
+
1169
+ .feature-grid {
1170
+ display: grid;
1171
+ grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
1172
+ gap: 6px;
1173
+ }
1174
+
1175
+ .feature-chip {
1176
+ position: relative;
1177
+ padding: 8px 12px;
1178
+ font-family: 'SF Mono', 'Monaco', 'Inconsolata', monospace;
1179
+ font-size: 11px;
1180
+ background: var(--bg-secondary);
1181
+ border: 1px solid var(--border);
1182
+ cursor: default;
1183
+ transition: all 0.15s;
1184
+ white-space: nowrap;
1185
+ overflow: hidden;
1186
+ text-overflow: ellipsis;
1187
+ }
1188
+
1189
+ .feature-chip:hover {
1190
+ border-color: var(--text);
1191
+ background: var(--bg);
1192
+ overflow: visible;
1193
+ }
1194
+
1195
+ .feature-chip:hover::after {
1196
+ content: attr(data-tip);
1197
+ position: absolute;
1198
+ bottom: calc(100% + 10px);
1199
+ left: 50%;
1200
+ transform: translateX(-50%);
1201
+ padding: 12px 16px;
1202
+ background: var(--text);
1203
+ color: var(--bg);
1204
+ font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
1205
+ font-size: 12px;
1206
+ line-height: 1.5;
1207
+ white-space: normal;
1208
+ width: max-content;
1209
+ max-width: 300px;
1210
+ z-index: 1000;
1211
+ pointer-events: none;
1212
+ box-shadow: 0 4px 16px rgba(0, 0, 0, 0.2);
1213
+ }
1214
+
1215
+ .feature-chip:hover::before {
1216
+ content: '';
1217
+ position: absolute;
1218
+ bottom: calc(100% + 4px);
1219
+ left: 50%;
1220
+ transform: translateX(-50%);
1221
+ border: 6px solid transparent;
1222
+ border-top-color: var(--text);
1223
+ z-index: 1000;
1224
+ pointer-events: none;
1225
+ }
1226
+
1227
+ /* SUBSECTION SPACER */
1228
+ .subsection {
1229
+ margin-top: 24px;
1230
+ }
1231
+
1232
+ /* TABLE SCROLL WRAPPER */
1233
+ .table-scroll {
1234
+ overflow-x: auto;
1235
+ margin-bottom: 32px;
1236
+ }
1237
+
1238
+ /* INSIGHT CARDS (comparison tab) */
1239
+ .insights-grid {
1240
+ display: grid;
1241
+ grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
1242
+ gap: 16px;
1243
+ }
1244
+
1245
+ .insight-card {
1246
+ padding: 20px;
1247
+ border: 1px solid var(--border);
1248
+ }
1249
+
1250
+ .insight-card.insight-safe { border-left: 3px solid var(--safe); }
1251
+ .insight-card.insight-accent { border-left: 3px solid var(--accent); }
1252
+
1253
+ .insight-label {
1254
+ font-size: 11px;
1255
+ font-weight: 600;
1256
+ letter-spacing: 0.1em;
1257
+ text-transform: uppercase;
1258
+ margin-bottom: 8px;
1259
+ }
1260
+
1261
+ .insight-title {
1262
+ font-size: 16px;
1263
+ font-weight: 700;
1264
+ margin-bottom: 4px;
1265
+ }
1266
+
1267
+ .insight-desc {
1268
+ font-size: 12px;
1269
+ color: var(--text-secondary);
1270
+ }
1271
+
1272
+ /* MODELS PAGE RESPONSIVE */
1273
+ @media (max-width: 640px) {
1274
+ .models-page header {
1275
+ flex-direction: column;
1276
+ align-items: flex-start;
1277
+ gap: 16px;
1278
+ }
1279
+
1280
+ .page-title {
1281
+ font-size: 22px;
1282
+ }
1283
+
1284
+ .comparison-table {
1285
+ font-size: 11px;
1286
+ }
1287
+
1288
+ .comparison-table th,
1289
+ .comparison-table td {
1290
+ padding: 8px 10px;
1291
+ }
1292
+
1293
+ .metrics-grid {
1294
+ grid-template-columns: repeat(2, 1fr);
1295
+ }
1296
+
1297
+ .model-detail {
1298
+ padding: 20px;
1299
+ }
1300
+
1301
+ .pipeline {
1302
+ flex-wrap: wrap;
1303
+ }
1304
+
1305
+ .pipeline-step {
1306
+ flex: 1 1 40%;
1307
+ border-right: 1px solid var(--border);
1308
+ margin: -0.5px;
1309
+ }
1310
+
1311
+ .pipeline-step:not(:last-child)::after { display: none; }
1312
+
1313
+ .confusion-matrix {
1314
+ font-size: 11px;
1315
+ }
1316
+
1317
+ .cm-cell {
1318
+ padding: 12px 16px;
1319
+ font-size: 12px;
1320
+ }
1321
+
1322
+ .feature-grid {
1323
+ grid-template-columns: repeat(2, 1fr);
1324
+ }
1325
+ }
start_server.bat ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
@echo off
REM Phishing Detection Server Startup Script
REM Starts the FastAPI server for phishing detection.
REM NOTE(review): start_server.sh installs server/requirements.txt while this
REM script installs the repo-root requirements.txt -- confirm which is intended.

REM Run relative to this script's own directory so the venv\ and server\
REM checks below work no matter where the script is invoked from.
cd /d "%~dp0"

echo ============================================
echo  Phishing Detection Server
echo ============================================
echo.

REM Check if virtual environment exists
if not exist "venv\" (
    echo ERROR: Virtual environment not found!
    echo Please run: python -m venv venv
    pause
    exit /b 1
)

REM Activate virtual environment
echo [1/3] Activating virtual environment...
call venv\Scripts\activate.bat

REM Install server dependencies if needed
echo [2/3] Checking dependencies...
pip install -q -r requirements.txt

REM Start the server
echo [3/3] Starting server...
echo.
echo ============================================
echo Server running at: http://localhost:8000
echo API Docs: http://localhost:8000/docs
echo Press Ctrl+C to stop
echo ============================================
echo.

REM Fail fast if the server directory is missing instead of launching
REM uvicorn from the wrong working directory.
if not exist "server\" (
    echo ERROR: server directory not found!
    exit /b 1
)
cd server
python -m uvicorn app:app --host 0.0.0.0 --port 8000 --reload
start_server.sh ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Phishing Detection Server Startup Script (Linux/Mac)
# NOTE(review): this installs server/requirements.txt while start_server.bat
# installs the repo-root requirements.txt -- confirm which is intended.

# Abort on any failed step (missing venv activation, failed pip install)
# instead of launching the server in a broken environment.
set -e

# Run relative to this script's own directory so the venv/ check and the
# cd into server/ work no matter where the script is invoked from.
cd "$(dirname "$0")"

echo "============================================"
echo " Phishing Detection Server"
echo "============================================"
echo ""

# Check if virtual environment exists
if [ ! -d "venv" ]; then
    echo "ERROR: Virtual environment not found!"
    echo "Please run: python -m venv venv"
    exit 1
fi

# Activate virtual environment
echo "[1/3] Activating virtual environment..."
source venv/bin/activate

# Install server dependencies if needed
echo "[2/3] Checking dependencies..."
pip install -q -r server/requirements.txt

# Start the server
echo "[3/3] Starting server..."
echo ""
echo "============================================"
echo " Server running at: http://localhost:8000"
echo " API Docs: http://localhost:8000/docs"
echo " Press Ctrl+C to stop"
echo "============================================"
echo ""

cd server
python -m uvicorn app:app --host 0.0.0.0 --port 8000 --reload