Rasel Santillan committed on
Commit 8a9ac80 · 0 Parent(s)

Squashed clean history

Files changed (12)
  1. .dockerignore +83 -0
  2. .gitattributes +35 -0
  3. Dockerfile +73 -0
  4. README.md +206 -0
  5. app.py +28 -0
  6. categorization.py +103 -0
  7. main.py +305 -0
  8. model/__init__.py +8 -0
  9. model/model.py +298 -0
  10. model/url_feature_extractor.py +920 -0
  11. requirements.txt +22 -0
  12. run.py +40 -0
.dockerignore ADDED
@@ -0,0 +1,83 @@
# Python cache and compiled files
__pycache__/
*.py[cod]
*$py.class
*.so
.Python

# Virtual environments
venv/
env/
ENV/
.venv/
virtualenv/

# IDE and editor files
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store

# Git
.git/
.gitignore
.gitattributes

# Jupyter notebooks
*.ipynb
.ipynb_checkpoints/

# Testing
.pytest_cache/
.coverage
htmlcov/
.tox/
*.cover

# Documentation
*.md
docs/
README.md

# Logs
*.log
logs/

# Environment variables
.env
.env.local
.env.*.local

# Dataset and training files
data/
datasets/
*.csv
*.xlsx
*.json

# Model training artifacts (keep only the final model)
checkpoints/
experiments/
mlruns/

# Development dependencies
requirements-dev.txt
setup.py
setup.cfg

# CI/CD
.github/
.gitlab-ci.yml
.travis.yml

# Docker
Dockerfile.dev
docker-compose.yml
docker-compose.*.yml

# Misc
*.bak
*.tmp
.cache/
.gitattributes ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,73 @@
# Dockerfile for Phishing URL Detection FastAPI Application
# Base image: Python 3.10 slim for smaller image size
FROM python:3.10-slim

# Set environment variables
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1

# Install system dependencies required for ML libraries and Playwright
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgomp1 \
    wget \
    # Playwright/Chromium dependencies
    libnss3 \
    libnspr4 \
    libatk1.0-0 \
    libatk-bridge2.0-0 \
    libcups2 \
    libdrm2 \
    libdbus-1-3 \
    libxkbcommon0 \
    libxcomposite1 \
    libxdamage1 \
    libxfixes3 \
    libxrandr2 \
    libgbm1 \
    libpango-1.0-0 \
    libcairo2 \
    libasound2 \
    libatspi2.0-0 \
    libxshmfence1 \
    && rm -rf /var/lib/apt/lists/*

# Create non-root user for security
RUN useradd -m -u 1000 user && \
    mkdir -p /app && \
    chown -R user:user /app

# Set working directory
WORKDIR /app

# Copy requirements first for better layer caching
COPY --chown=user:user requirements.txt .

# Switch to non-root user
USER user

# Add the user's local bin to PATH
ENV PATH="/home/user/.local/bin:$PATH"

# Install Python dependencies
RUN pip install --user --no-cache-dir --upgrade pip && \
    pip install --user --no-cache-dir -r requirements.txt

# Install Playwright browsers (as user)
# System dependencies are already installed above, so only the browser binaries are needed
RUN python -m playwright install chromium

# Copy application code and model
COPY --chown=user:user . .

# Expose ports (7860 is the default; 8000 for compatibility)
EXPOSE 7860 8000

# Health check (uses port 7860 by default)
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD python -c "import requests; requests.get('http://localhost:7860/health')" || exit 1

# Run the application
# Use app.py for Hugging Face Spaces compatibility; defaults to port 7860
CMD ["python", "app.py"]
README.md ADDED
@@ -0,0 +1,206 @@
---
title: Phishing URL Detection API
emoji: 🔒
colorFrom: red
colorTo: yellow
sdk: docker
pinned: false
license: mit
app_port: 7860
---

# Phishing URL Detection API

A FastAPI-based REST API for detecting phishing URLs using machine learning. The service analyzes URL features and webpage content to classify URLs as legitimate or phishing attempts.

## Features

- 🔍 **Real-time URL Analysis**: Extracts 43 features from URLs and their webpages
- 🤖 **Machine Learning**: Uses a stacking ensemble model for accurate predictions
- 🚀 **Fast API**: Built with FastAPI for high performance and automatic documentation
- 🐳 **Docker Support**: Containerized for easy deployment
- 📊 **Confidence Scores**: Returns prediction confidence for better decision-making
- 🔒 **CORS Enabled**: Accessible from web browsers

## Project Structure

```
url-phish-fastapi/
├── app.py                         # Entry point for Hugging Face Spaces
├── main.py                        # FastAPI application
├── categorization.py              # Risk categorization helpers
├── run.py
├── model/
│   ├── __init__.py                # Package initialization
│   ├── model.py                   # Model loading and prediction logic
│   └── url_feature_extractor.py   # Feature extraction from URLs
├── requirements.txt               # Python dependencies
├── Dockerfile                     # Docker configuration
├── .dockerignore                  # Docker ignore patterns
└── README.md                      # This file
```

The pre-trained stacking model (`url_stacking_model.joblib`) is downloaded from the Hugging Face Hub on first use (see `model/model.py`).

## API Endpoints

### Health Check
- **GET** `/` - Root endpoint
- **GET** `/health` - Health check endpoint

### Prediction
- **POST** `/predict` - Analyze a URL for phishing detection

**Request Body:**
```json
{
  "url": "http://example.com"
}
```

**Response:**
```json
{
  "url": "http://example.com",
  "prediction": "legitimate",
  "confidence": 0.95,
  "predicted_label": 0,
  "phish_probability": 0.05,
  "phish_probability_percent": 5.0,
  "risk_category": "Safe",
  "binary_classification": "Legitimate",
  "error": null
}
```
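
### Model Update (Online Learning)
- **POST** `/update` - Report the true label for a misclassified URL; the meta-model is updated in place via `partial_fit` (see `main.py`)

The request body carries the URL and its true label (`0` for legitimate, `1` for phishing). A minimal Python sketch for submitting feedback (illustrative only; the URL below is just an example and the service is assumed to be reachable on `localhost:7860`):

```python
import requests

# Hypothetical feedback payload: report that this URL was actually phishing
feedback = {"url": "http://example.com/login", "true_label": 1}  # 1 = phishing, 0 = legitimate
resp = requests.post("http://localhost:7860/update", json=feedback, timeout=120)
print(resp.json()["status"])  # "success" or "failed"
```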

### Interactive Documentation
- **Swagger UI**: `http://localhost:7860/docs`
- **ReDoc**: `http://localhost:7860/redoc`

## Installation & Usage

### Option 1: Local Development

1. **Install dependencies:**
   ```bash
   pip install -r requirements.txt
   ```

2. **Run the application:**
   ```bash
   python app.py
   ```

3. **Access the API:**
   - API: http://localhost:7860
   - Docs: http://localhost:7860/docs

### Option 2: Docker (Recommended)

1. **Build the Docker image:**
   ```bash
   docker build -t phishing-url-api .
   ```

2. **Run the container:**
   ```bash
   docker run -p 7860:7860 phishing-url-api
   ```

3. **Access the API:**
   - API: http://localhost:7860
   - Docs: http://localhost:7860/docs

### Option 3: Docker with a Custom Port

```bash
docker run -p 8000:8000 -e PORT=8000 phishing-url-api
```

## Testing

Verify that the API is working with curl:

```bash
# Health check
curl http://localhost:7860/health

# Predict a URL
curl -X POST http://localhost:7860/predict \
  -H "Content-Type: application/json" \
  -d '{"url": "https://www.google.com"}'
```
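
Or from Python (a minimal sketch using the `requests` library; assumes the service is running locally on port 7860):

```python
import requests

resp = requests.post(
    "http://localhost:7860/predict",
    json={"url": "https://www.google.com"},
    timeout=60,
)
result = resp.json()

if result["prediction"] == "unknown":
    # Returned when the URL is unreachable or feature extraction fails
    print("Could not analyze URL:", result.get("error"))
else:
    print(result["prediction"], result["risk_category"], f"{result['phish_probability_percent']:.1f}%")
```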

## Model Information

The API uses a **stacking ensemble model** that combines multiple base classifiers:
- Random Forest
- Gradient Boosting
- XGBoost
- LightGBM
- Logistic Regression (meta-model)
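
In simplified form, the two levels fit together like this: each base model emits a phishing probability, and the meta-model makes the final call from those probabilities. The sketch below is illustrative only, with stand-in models and synthetic data rather than the shipped artifact:

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Synthetic stand-in for the 43 extracted features
X, y = make_classification(n_samples=500, n_features=43, random_state=0)

base_models = {
    "rf": RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y),
    "gb": GradientBoostingClassifier(random_state=0).fit(X, y),
}

# Level 0: each base model contributes its positive-class probability
meta_features = np.column_stack(
    [m.predict_proba(X)[:, 1] for m in base_models.values()]
)

# Level 1: the meta-model combines those probabilities into the final score
meta_model = LogisticRegression().fit(meta_features, y)
phish_probability = meta_model.predict_proba(meta_features[:1])[:, 1][0]
print(f"phish_probability: {phish_probability:.3f}")
```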

### Features Extracted (43 total)

The model analyzes various HTML elements and webpage characteristics:
- Form elements (inputs, buttons, password fields)
- Media elements (images, videos, audio)
- Structural elements (divs, tables, lists)
- Content metrics (text length, title length)
- Interactive elements (links, scripts, iframes)
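
Internally, `model/url_feature_extractor.py` exposes `extract_features(url)`, which returns these 43 values as a flat dictionary of integers; a value of `-1` marks a feature that could not be extracted (for example, when the page is unreachable). A quick sketch:

```python
from model.url_feature_extractor import extract_features

features = extract_features("https://www.example.com")
print(len(features))                 # 43
print(features["has_title"])         # 1 if the page has a <title>, 0 if not, -1 on failure
print(features["number_of_script"])  # count of <script> tags, or -1 on failure
```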

## Dependencies

- **FastAPI**: Web framework
- **Uvicorn**: ASGI server
- **Scikit-learn**: Machine learning
- **Pandas/NumPy**: Data processing
- **BeautifulSoup4**: HTML parsing
- **Requests**: HTTP requests
- **XGBoost/LightGBM**: Gradient boosting models
- **Playwright** (optional): Headless Chromium for rendering JavaScript-heavy pages
- **huggingface_hub**: Downloads the pre-trained model artifact

## Error Handling

The API handles several error scenarios:
- **400 Bad Request**: Invalid or empty URL
- **500 Internal Server Error**: Model loading or prediction failures
- **Unknown prediction**: Returned when the URL is unreachable or feature extraction fails

## Performance Considerations

- The model is loaded once on startup (singleton pattern)
- Feature extraction may take 5-10 seconds for live URLs
- Unreachable URLs return an "unknown" prediction
- HTTPS certificate verification is disabled for broader compatibility

## Security Notes

- The API makes outbound HTTP requests to analyze URLs
- SSL verification is disabled during feature extraction
- Apply appropriate network security when deploying
- Consider rate limiting for production use

## Deployment

### Hugging Face Spaces

This project is configured for deployment on Hugging Face Spaces using the Docker SDK.

### Other Platforms

The Docker container can be deployed on:
- AWS ECS/Fargate
- Google Cloud Run
- Azure Container Instances
- Kubernetes
- Any Docker-compatible platform

## License

MIT (see the `license` field in the front matter above).

## Contributing

[Add contribution guidelines here]

## Support

For issues and questions, please [create an issue](https://github.com/yourusername/yourrepo/issues).
app.py ADDED
@@ -0,0 +1,28 @@
"""
Entry point for Hugging Face Spaces deployment.
This file is required by Hugging Face Spaces and must be named 'app.py'.
"""

import os
import uvicorn
from main import app

if __name__ == "__main__":
    # Default to port 7860 (Hugging Face Spaces standard)
    port = int(os.getenv("PORT", "7860"))
    host = os.getenv("HOST", "0.0.0.0")

    print("=" * 60)
    print("🔒 Phishing URL Detection API")
    print("=" * 60)
    print(f"Starting server on {host}:{port}")
    print(f"API Documentation: http://{host if host != '0.0.0.0' else 'localhost'}:{port}/docs")
    print("=" * 60)

    uvicorn.run(
        app,
        host=host,
        port=port,
        log_level="info"
    )
categorization.py ADDED
@@ -0,0 +1,103 @@
"""
Risk categorization module for phishing detection results.

This module provides functions to categorize phishing probability scores
into risk categories and binary classifications.
"""

from enum import Enum
from typing import Tuple


class RiskCategory(str, Enum):
    """Risk category based on phishing probability score."""
    SAFE = "Safe"
    LOW = "Low"
    MODERATE = "Moderate"
    HIGH = "Dangerous"
    CRITICAL = "Critical"


class BinaryClassification(str, Enum):
    """Binary classification of phishing detection result."""
    LEGITIMATE = "Legitimate"
    PHISHING = "Phishing"


# Risk category thresholds (score is on a 0-100 scale)
RISK_THRESHOLDS = {
    RiskCategory.SAFE: (0, 25),        # score < 25
    RiskCategory.LOW: (25, 50),        # 25 <= score < 50
    RiskCategory.MODERATE: (50, 70),   # 50 <= score < 70
    RiskCategory.HIGH: (70, 85),       # 70 <= score < 85
    RiskCategory.CRITICAL: (85, 101),  # score >= 85
}

# Binary classification threshold
PHISHING_THRESHOLD = 70  # score >= 70 is classified as Phishing


def get_risk_category(phish_probability_score: float) -> RiskCategory:
    """
    Determine the risk category based on phishing probability score.

    Args:
        phish_probability_score: Phishing probability score (0-100 scale)

    Returns:
        RiskCategory: The corresponding risk category
    """
    if phish_probability_score < 25:
        return RiskCategory.SAFE
    elif phish_probability_score < 50:
        return RiskCategory.LOW
    elif phish_probability_score < 70:
        return RiskCategory.MODERATE
    elif phish_probability_score < 85:
        return RiskCategory.HIGH
    else:
        return RiskCategory.CRITICAL


def get_binary_classification(phish_probability_score: float) -> BinaryClassification:
    """
    Determine the binary classification based on phishing probability score.

    Args:
        phish_probability_score: Phishing probability score (0-100 scale)

    Returns:
        BinaryClassification: Legitimate if score < 70, Phishing otherwise
    """
    if phish_probability_score < PHISHING_THRESHOLD:
        return BinaryClassification.LEGITIMATE
    else:
        return BinaryClassification.PHISHING


def categorize_phishing_result(phish_probability: float) -> Tuple[RiskCategory, BinaryClassification, float]:
    """
    Categorize a phishing detection result.

    This function takes a phishing probability (0-1 scale) and returns:
    - Risk category (Safe, Low, Moderate, Dangerous, Critical)
    - Binary classification (Legitimate or Phishing)
    - The probability score on a 0-100 scale

    Args:
        phish_probability: Phishing probability from the model (0-1 scale)

    Returns:
        Tuple containing:
        - RiskCategory: The risk category
        - BinaryClassification: The binary classification
        - float: The probability score on a 0-100 scale
    """
    # Convert from 0-1 scale to 0-100 scale
    score_100 = phish_probability * 100

    risk_category = get_risk_category(score_100)
    binary_classification = get_binary_classification(score_100)

    return risk_category, binary_classification, score_100
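
A quick usage sketch of the helpers above (illustrative values only, not part of the committed file):

```python
from categorization import categorize_phishing_result

# A model output of 0.75 on the 0-1 scale maps to a score of 75 on the 0-100 scale
risk, label, score = categorize_phishing_result(0.75)
print(risk.value, label.value, score)  # Dangerous Phishing 75.0
```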
main.py ADDED
@@ -0,0 +1,305 @@
1
+ """
2
+ FastAPI application for phishing URL detection.
3
+ """
4
+
5
+ from fastapi import FastAPI, HTTPException, status
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+ from pydantic import BaseModel, Field, validator
8
+ from typing import Optional
9
+ import logging
10
+
11
+ from model.model import predict_url, load_model, get_meta_features_and_update
12
+ from categorization import categorize_phishing_result, RiskCategory, BinaryClassification
13
+
14
+ # Configure logging
15
+ logging.basicConfig(
16
+ level=logging.INFO,
17
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
18
+ )
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # Create FastAPI application
22
+ app = FastAPI(
23
+ title="Phishing URL Detection API",
24
+ description="API for detecting phishing URLs using machine learning. Analyzes URL features to classify URLs as legitimate or phishing attempts.",
25
+ version="1.0.0",
26
+ docs_url="/docs",
27
+ redoc_url="/redoc"
28
+ )
29
+
30
+ # Configure CORS middleware to allow web browser access
31
+ app.add_middleware(
32
+ CORSMiddleware,
33
+ allow_origins=["*"], # In production, replace with specific origins
34
+ allow_credentials=True,
35
+ allow_methods=["*"],
36
+ allow_headers=["*"],
37
+ )
38
+
39
+
40
+ # Pydantic models for request/response validation
41
+ class URLRequest(BaseModel):
42
+ """Request model for URL prediction."""
43
+ url: str = Field(
44
+ ...,
45
+ description="The URL to analyze for phishing detection",
46
+ example="http://example.com"
47
+ )
48
+
49
+ @validator('url')
50
+ def validate_url(cls, v):
51
+ """Validate that URL is not empty."""
52
+ if not v or not v.strip():
53
+ raise ValueError("URL cannot be empty")
54
+ return v.strip()
55
+
56
+
57
+ class PredictionResponse(BaseModel):
58
+ """Response model for URL prediction."""
59
+ url: str = Field(..., description="The analyzed URL")
60
+ prediction: str = Field(..., description="Prediction result: 'phishing', 'legitimate', or 'unknown'")
61
+ confidence: float = Field(..., description="Confidence score (0-1)")
62
+ predicted_label: int = Field(..., description="Predicted label: 0 (legitimate), 1 (phishing), -1 (unknown)")
63
+ phish_probability: float = Field(..., description="Probability of being phishing (0-1)")
64
+ phish_probability_percent: float = Field(..., description="Probability of being phishing (0-100 scale)")
65
+ risk_category: str = Field(..., description="Risk category: 'Safe', 'Low', 'Moderate', 'Dangerous', or 'Critical'")
66
+ binary_classification: str = Field(..., description="Binary classification: 'Legitimate' or 'Phishing'")
67
+ error: Optional[str] = Field(None, description="Error message if prediction failed")
68
+
69
+
70
+ class HealthResponse(BaseModel):
71
+ """Response model for health check."""
72
+ status: str = Field(..., description="Service status")
73
+ message: str = Field(..., description="Status message")
74
+
75
+
76
+ class UpdateRequest(BaseModel):
77
+ """Request model for online learning update."""
78
+ url: str = Field(..., description="The URL that was misclassified")
79
+ true_label: int = Field(..., description="True label: 0 (legitimate) or 1 (phishing)")
80
+
81
+ @validator('true_label')
82
+ def validate_label(cls, v):
83
+ """Validate that true_label is 0 or 1."""
84
+ if v not in [0, 1]:
85
+ raise ValueError("true_label must be 0 (legitimate) or 1 (phishing)")
86
+ return v
87
+
88
+
89
+ class UpdateResponse(BaseModel):
90
+ """Response model for online learning update."""
91
+ status: str = Field(..., description="Update status")
92
+ message: str = Field(..., description="Update message")
93
+ url: str = Field(..., description="The URL that was updated")
94
+ true_label: int = Field(..., description="The true label used for update")
95
+ meta_features: Optional[list] = Field(None, description="Meta features used for update")
96
+
97
+
98
+ # API Endpoints
99
+ @app.get("/", response_model=HealthResponse, tags=["Health"])
100
+ async def root():
101
+ """
102
+ Root endpoint - Health check.
103
+
104
+ Returns:
105
+ HealthResponse: Service status information
106
+ """
107
+ return HealthResponse(
108
+ status="healthy",
109
+ message="Phishing URL Detection API is running"
110
+ )
111
+
112
+
113
+ @app.get("/health", response_model=HealthResponse, tags=["Health"])
114
+ async def health_check():
115
+ """
116
+ Health check endpoint.
117
+
118
+ Returns:
119
+ HealthResponse: Service status information
120
+ """
121
+ return HealthResponse(
122
+ status="healthy",
123
+ message="Service is operational"
124
+ )
125
+
126
+
127
+ @app.post("/predict", response_model=PredictionResponse, tags=["Prediction"])
128
+ async def predict(request: URLRequest):
129
+ """
130
+ Predict whether a URL is phishing or legitimate.
131
+
132
+ This endpoint:
133
+ 1. Validates the input URL
134
+ 2. Extracts features from the URL and its webpage
135
+ 3. Uses a machine learning model to classify the URL
136
+ 4. Returns the prediction with confidence score
137
+
138
+ Args:
139
+ request: URLRequest containing the URL to analyze
140
+
141
+ Returns:
142
+ PredictionResponse: Prediction result with confidence score
143
+
144
+ Raises:
145
+ HTTPException: 400 for invalid input, 500 for server errors
146
+ """
147
+ try:
148
+ logger.info(f"Received prediction request for URL: {request.url}")
149
+
150
+ # Validate URL is not empty (already done by Pydantic validator)
151
+ if not request.url:
152
+ raise HTTPException(
153
+ status_code=status.HTTP_400_BAD_REQUEST,
154
+ detail="URL cannot be empty"
155
+ )
156
+
157
+ # Call prediction function
158
+ result = predict_url(request.url)
159
+
160
+ # Add risk categorization
161
+ risk_category, binary_classification, score_100 = categorize_phishing_result(
162
+ result['phish_probability']
163
+ )
164
+ result['phish_probability_percent'] = score_100
165
+ result['risk_category'] = risk_category.value
166
+ result['binary_classification'] = binary_classification.value
167
+
168
+ logger.info(f"Prediction successful: {result['prediction']} | Risk: {risk_category.value} | Classification: {binary_classification.value}")
169
+
170
+ return PredictionResponse(**result)
171
+
172
+ except ValueError as e:
173
+ # Handle validation errors
174
+ logger.error(f"Validation error: {str(e)}")
175
+ raise HTTPException(
176
+ status_code=status.HTTP_400_BAD_REQUEST,
177
+ detail=f"Invalid input: {str(e)}"
178
+ )
179
+
180
+ except FileNotFoundError as e:
181
+ # Handle model file not found
182
+ logger.error(f"Model file not found: {str(e)}")
183
+ raise HTTPException(
184
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
185
+ detail="Model file not found. Please ensure the model is properly deployed."
186
+ )
187
+
188
+ except Exception as e:
189
+ # Handle all other errors
190
+ logger.error(f"Prediction error: {str(e)}", exc_info=True)
191
+ raise HTTPException(
192
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
193
+ detail=f"An error occurred during prediction: {str(e)}"
194
+ )
195
+
196
+
197
+ @app.post("/update", response_model=UpdateResponse, tags=["Update"])
198
+ async def update_model(request: UpdateRequest):
199
+ """
200
+ Update the meta model using online learning with partial_fit.
201
+
202
+ This endpoint:
203
+ 1. Extracts features from the URL
204
+ 2. Generates meta-features using base models
205
+ 3. Updates the SGD meta model with partial_fit
206
+ 4. Saves the updated model
207
+
208
+ Args:
209
+ request: UpdateRequest containing URL and true label
210
+
211
+ Returns:
212
+ UpdateResponse: Update status and meta features used
213
+
214
+ Raises:
215
+ HTTPException: 400 for invalid input, 500 for server errors
216
+ """
217
+ try:
218
+ logger.info(f"Received update request for URL: {request.url} with label: {request.true_label}")
219
+
220
+ # Validate inputs
221
+ if not request.url or not request.url.strip():
222
+ raise HTTPException(
223
+ status_code=status.HTTP_400_BAD_REQUEST,
224
+ detail="URL cannot be empty"
225
+ )
226
+
227
+ if request.true_label not in [0, 1]:
228
+ raise HTTPException(
229
+ status_code=status.HTTP_400_BAD_REQUEST,
230
+ detail="true_label must be 0 (legitimate) or 1 (phishing)"
231
+ )
232
+
233
+ # Get meta features and update model
234
+ meta_features, updated = get_meta_features_and_update(request.url, request.true_label)
235
+
236
+ if not updated:
237
+ logger.warning(f"Failed to update model for URL: {request.url}")
238
+ return UpdateResponse(
239
+ status="failed",
240
+ message="Failed to update model - feature extraction may have failed",
241
+ url=request.url,
242
+ true_label=request.true_label,
243
+ meta_features=None
244
+ )
245
+
246
+ logger.info(f"✅ Model updated successfully for URL: {request.url}")
247
+
248
+ return UpdateResponse(
249
+ status="success",
250
+ message="Meta model updated successfully with partial_fit",
251
+ url=request.url,
252
+ true_label=request.true_label,
253
+ meta_features=meta_features.tolist() if meta_features is not None else None
254
+ )
255
+
256
+ except ValueError as e:
257
+ logger.error(f"Validation error in update: {str(e)}")
258
+ raise HTTPException(
259
+ status_code=status.HTTP_400_BAD_REQUEST,
260
+ detail=f"Invalid input: {str(e)}"
261
+ )
262
+
263
+ except Exception as e:
264
+ logger.error(f"Update error: {str(e)}", exc_info=True)
265
+ raise HTTPException(
266
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
267
+ detail=f"An error occurred during model update: {str(e)}"
268
+ )
269
+
270
+
271
+ # Startup event
272
+ @app.on_event("startup")
273
+ async def startup_event():
274
+ """
275
+ Startup event handler.
276
+ Loads the model on application startup to ensure it's ready.
277
+ """
278
+ try:
279
+ logger.info("Starting up Phishing URL Detection API...")
280
+ from model.model import load_model
281
+ load_model() # Pre-load model on startup
282
+ logger.info("✅ Model loaded successfully on startup")
283
+ except Exception as e:
284
+ logger.error(f"❌ Failed to load model on startup: {str(e)}")
285
+ # Don't prevent startup, but log the error
286
+
287
+
288
+ # Shutdown event
289
+ @app.on_event("shutdown")
290
+ async def shutdown_event():
291
+ """
292
+ Shutdown event handler.
293
+ """
294
+ logger.info("Shutting down Phishing URL Detection API...")
295
+
296
+
297
+ if __name__ == "__main__":
298
+ import uvicorn
299
+ uvicorn.run(
300
+ "main:app",
301
+ host="0.0.0.0",
302
+ port=8000,
303
+ reload=True,
304
+ log_level="info"
305
+ )
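
For reference, the endpoints above can also be exercised without a running server by using FastAPI's test client (a minimal sketch; it relies on the `httpx` package that FastAPI's `TestClient` uses, and the prediction call still downloads the model and makes real network requests for feature extraction):

```python
from fastapi.testclient import TestClient
from main import app

client = TestClient(app)

# Health check should succeed as soon as the app imports cleanly
resp = client.get("/health")
print(resp.status_code, resp.json())  # 200 {'status': 'healthy', 'message': 'Service is operational'}

# Prediction request (slow: fetches the model artifact and the target page)
resp = client.post("/predict", json={"url": "https://www.example.com"})
print(resp.json()["prediction"])
```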
model/__init__.py ADDED
@@ -0,0 +1,8 @@
"""
Model package for phishing URL detection.
"""

from .model import predict_url, load_model

__all__ = ['predict_url', 'load_model']
model/model.py ADDED
@@ -0,0 +1,298 @@
1
+ """
2
+ Model loading and prediction module for phishing URL detection.
3
+ """
4
+
5
+ import logging
6
+ import numpy as np
7
+ import pandas as pd
8
+ import joblib
9
+ from typing import Dict, Any, Optional, Tuple
10
+ import warnings
11
+ from huggingface_hub import hf_hub_download
12
+
13
+ # Import feature extraction function
14
+ from .url_feature_extractor import extract_features
15
+
16
+ warnings.filterwarnings("ignore", message="X does not have valid feature names", category=UserWarning)
17
+
18
+ # Configure logging
19
+ logging.basicConfig(level=logging.INFO)
20
+ logger = logging.getLogger(__name__)
21
+
22
+ # Global variable to cache the loaded model (singleton pattern)
23
+ _model_cache: Optional[Dict[str, Any]] = None
24
+
25
+
26
+ def get_model_path() -> str:
27
+ """
28
+ Download the model from Hugging Face Hub and return the local path.
29
+
30
+ Returns:
31
+ str: Local path to the downloaded model file
32
+ """
33
+ model_path = hf_hub_download(
34
+ repo_id="xxemrzru/url-stacking-model",
35
+ filename="url_stacking_model.joblib"
36
+ )
37
+ return model_path
38
+
39
+
40
+ def load_model() -> Dict[str, Any]:
41
+ """
42
+ Load the saved stacking model from file.
43
+ Uses singleton pattern to load model only once.
44
+
45
+ Returns:
46
+ dict: Dictionary containing model components:
47
+ - base_models: Dictionary of base models
48
+ - meta_model: Final meta model
49
+ - feature_names: List of feature names
50
+ - model_names: List of base model names
51
+
52
+ Raises:
53
+ FileNotFoundError: If model file doesn't exist
54
+ Exception: If model loading fails
55
+ """
56
+ global _model_cache
57
+
58
+ # Return cached model if already loaded
59
+ if _model_cache is not None:
60
+ logger.info("Using cached model")
61
+ return _model_cache
62
+
63
+ try:
64
+ model_path = get_model_path()
65
+
66
+ logger.info(f"Loading model from: {model_path}")
67
+ model_data = joblib.load(model_path)
68
+
69
+ # Cache the model
70
+ _model_cache = {
71
+ "base_models": model_data["base_models"],
72
+ "meta_model": model_data["meta_model"],
73
+ "feature_names": model_data["feature_names"],
74
+ "model_names": model_data["model_names"]
75
+ }
76
+
77
+ logger.info("✅ Model loaded successfully")
78
+ return _model_cache
79
+
80
+ except Exception as e:
81
+ logger.error(f"❌ Failed to load model: {str(e)}")
82
+ raise
83
+
84
+
85
+ def predict_from_features(features_dict: Dict[str, Any], model_components: Dict[str, Any]) -> Dict[str, Any]:
86
+ """
87
+ Make predictions given a dictionary of extracted features.
88
+
89
+ Args:
90
+ features_dict: Dictionary where keys are feature names and values are feature values
91
+ model_components: The loaded components returned by load_model()
92
+
93
+ Returns:
94
+ dict: Contains 'predicted_label' (0 or 1) and 'phish_probability' (float)
95
+
96
+ Raises:
97
+ ValueError: If required features are missing
98
+ """
99
+ base_models = model_components["base_models"]
100
+ meta_model = model_components["meta_model"]
101
+ feature_names = model_components["feature_names"]
102
+ model_names = model_components["model_names"]
103
+
104
+ # Convert to DataFrame to ensure shape consistency
105
+ X = pd.DataFrame([features_dict])
106
+
107
+ # Ensure all required columns exist
108
+ missing_cols = set(feature_names) - set(X.columns)
109
+ if missing_cols:
110
+ raise ValueError(f"❌ Missing required features: {missing_cols}")
111
+
112
+ # Keep only known features and order them correctly
113
+ X = X[feature_names]
114
+
115
+ # Level 0: Base model predictions
116
+ meta_features = np.zeros((X.shape[0], len(base_models)))
117
+ for idx, (model_name, model) in enumerate(base_models.items()):
118
+ meta_features[:, idx] = model.predict_proba(X)[:, 1]
119
+
120
+ meta_features_df = pd.DataFrame(meta_features, columns=[f"{n}_pred" for n in model_names])
121
+
122
+ # Level 1: Meta-model prediction
123
+ final_pred = meta_model.predict(meta_features_df)[0]
124
+ final_prob = meta_model.predict_proba(meta_features_df)[:, 1][0]
125
+
126
+ return {
127
+ "predicted_label": int(final_pred),
128
+ "phish_probability": float(final_prob)
129
+ }
130
+
131
+
132
+ def predict_url(url: str) -> Dict[str, Any]:
133
+ """
134
+ Main prediction function that takes a raw URL and returns prediction.
135
+
136
+ This function:
137
+ 1. Loads the model (cached after first load)
138
+ 2. Extracts features from the URL using url_feature_extractor
139
+ 3. Makes prediction using the stacking model
140
+
141
+ Args:
142
+ url: Raw URL string to analyze
143
+
144
+ Returns:
145
+ dict: Prediction result containing:
146
+ - url: The input URL
147
+ - prediction: "phishing" or "legitimate"
148
+ - confidence: Probability score (0-1)
149
+ - predicted_label: 0 (legitimate) or 1 (phishing)
150
+ - phish_probability: Same as confidence
151
+
152
+ Raises:
153
+ Exception: If feature extraction or prediction fails
154
+ """
155
+ try:
156
+ # Load model (uses cache if already loaded)
157
+ model_components = load_model()
158
+
159
+ # Extract features from URL
160
+ logger.info(f"Extracting features from URL: {url}")
161
+ features_dict = extract_features(url)
162
+
163
+ # Check if feature extraction failed (all -1 values indicate extraction failure)
164
+ if all(v == -1 for v in features_dict.values()):
165
+ logger.warning(f"Feature extraction failed for URL: {url}")
166
+ # Return a default prediction for unreachable URLs
167
+ return {
168
+ "url": url,
169
+ "prediction": "unknown",
170
+ "confidence": 0.0,
171
+ "predicted_label": -1,
172
+ "phish_probability": 0.0,
173
+ "error": "Failed to extract features - URL may be unreachable"
174
+ }
175
+
176
+ # Make prediction
177
+ logger.info("Making prediction...")
178
+ prediction_result = predict_from_features(features_dict, model_components)
179
+
180
+ # Format response
181
+ predicted_label = prediction_result["predicted_label"]
182
+ phish_probability = prediction_result["phish_probability"]
183
+
184
+ result = {
185
+ "url": url,
186
+ "prediction": "phishing" if predicted_label == 1 else "legitimate",
187
+ "confidence": phish_probability if predicted_label == 1 else (1 - phish_probability),
188
+ "predicted_label": predicted_label,
189
+ "phish_probability": phish_probability
190
+ }
191
+
192
+ logger.info(f"✅ Prediction complete: {result['prediction']} (confidence: {result['confidence']:.2%})")
193
+ return result
194
+
195
+ except Exception as e:
196
+ logger.error(f"❌ Prediction failed: {str(e)}")
197
+ raise
198
+
199
+
200
+ def get_meta_features_and_update(url: str, true_label: int) -> Tuple[Optional[np.ndarray], bool]:
201
+ """
202
+ Extract meta-features from URL and update the SGD meta model using partial_fit.
203
+
204
+ This function:
205
+ 1. Extracts features from the URL
206
+ 2. Generates meta-features using base models (probability outputs)
207
+ 3. Updates the SGD meta model with partial_fit(meta_features, true_label)
208
+ 4. Saves the updated model to disk
209
+
210
+ Args:
211
+ url: Raw URL string to extract features from
212
+ true_label: True label (0 for legitimate, 1 for phishing)
213
+
214
+ Returns:
215
+ tuple: (meta_features_array, success_flag)
216
+ - meta_features_array: numpy array of meta features used for update
217
+ - success_flag: boolean indicating if update was successful
218
+
219
+ Raises:
220
+ Exception: If feature extraction or model update fails
221
+ """
222
+ try:
223
+ # Load model components
224
+ model_components = load_model()
225
+ base_models = model_components["base_models"]
226
+ meta_model = model_components["meta_model"]
227
+ feature_names = model_components["feature_names"]
228
+ model_names = model_components["model_names"]
229
+
230
+ # Extract features from URL
231
+ logger.info(f"Extracting features for update from URL: {url}")
232
+ features_dict = extract_features(url)
233
+
234
+ # Check if feature extraction failed
235
+ if all(v == -1 for v in features_dict.values()):
236
+ logger.warning(f"Feature extraction failed for URL update: {url}")
237
+ return None, False
238
+
239
+ # Convert to DataFrame and ensure proper ordering
240
+ X = pd.DataFrame([features_dict])
241
+ missing_cols = set(feature_names) - set(X.columns)
242
+ if missing_cols:
243
+ raise ValueError(f"Missing required features: {missing_cols}")
244
+ X = X[feature_names]
245
+
246
+ # Generate meta-features using base models (probability outputs)
247
+ meta_features = np.zeros((X.shape[0], len(base_models)))
248
+ for idx, (model_name, model) in enumerate(base_models.items()):
249
+ meta_features[:, idx] = model.predict_proba(X)[:, 1]
250
+
251
+ meta_features_df = pd.DataFrame(meta_features, columns=[f"{n}_pred" for n in model_names])
252
+
253
+ # Update the SGD meta model using partial_fit
254
+ logger.info(f"Updating meta model with partial_fit for label: {true_label}")
255
+ meta_model.partial_fit(meta_features_df, [true_label], classes=[0, 1])
256
+
257
+ # Update the cached model with the new meta model
258
+ global _model_cache
259
+ if _model_cache is not None:
260
+ _model_cache["meta_model"] = meta_model
261
+
262
+ # Save the updated model to disk
263
+ save_updated_model(model_components, meta_model)
264
+
265
+ logger.info(f"✅ Model updated successfully for URL: {url}")
266
+ return meta_features_df.values[0], True
267
+
268
+ except Exception as e:
269
+ logger.error(f"❌ Failed to update model: {str(e)}")
270
+ return None, False
271
+
272
+
273
+ def save_updated_model(model_components: Dict[str, Any], updated_meta_model) -> None:
274
+ """
275
+ Save the updated model components to disk.
276
+
277
+ Args:
278
+ model_components: Dictionary containing model components
279
+ updated_meta_model: The updated SGD meta model
280
+ """
281
+ try:
282
+ model_path = get_model_path()
283
+
284
+ # Create updated model data
285
+ updated_model_data = {
286
+ "base_models": model_components["base_models"],
287
+ "meta_model": updated_meta_model, # Use the updated meta model
288
+ "feature_names": model_components["feature_names"],
289
+ "model_names": model_components["model_names"]
290
+ }
291
+
292
+ # Save to disk
293
+ joblib.dump(updated_model_data, model_path)
294
+ logger.info(f"✅ Updated model saved to: {model_path}")
295
+
296
+ except Exception as e:
297
+ logger.error(f"❌ Failed to save updated model: {str(e)}")
298
+ raise
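
For reference, a minimal sketch of calling this module directly from Python (illustrative only; the first call downloads the model artifact from the Hugging Face Hub and fetches the target page, so it can take several seconds):

```python
from model.model import load_model, predict_url

load_model()  # optional warm-up; predict_url() loads and caches the model on demand

result = predict_url("https://www.example.com")
print(result["prediction"], result["phish_probability"])
```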
model/url_feature_extractor.py ADDED
@@ -0,0 +1,920 @@
1
+ """
2
+ URL Feature Extraction System for Phishing Detection
3
+ Extracts 43 specific features from URLs and their corresponding webpages.
4
+ """
5
+
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+ import pandas as pd
9
+ from urllib.parse import urlparse
10
+ import warnings
11
+ import time
12
+ import logging
13
+ import numpy as np
14
+ from requests.adapters import HTTPAdapter
15
+ from urllib3.util.retry import Retry
16
+ from functools import wraps
17
+ import asyncio
18
+ import sys
19
+
20
+ # Playwright imports (optional - graceful degradation if not installed)
21
+ try:
22
+ from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
23
+ PLAYWRIGHT_AVAILABLE = True
24
+ except ImportError:
25
+ PLAYWRIGHT_AVAILABLE = False
26
+ PlaywrightTimeoutError = Exception # Fallback for type hints
27
+
28
+ warnings.filterwarnings('ignore')
29
+
30
+ # Configure logging
31
+ logging.basicConfig(level=logging.INFO)
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ def _is_running_in_event_loop():
36
+ """
37
+ Check if code is running inside an asyncio event loop.
38
+
39
+ Returns:
40
+ bool: True if running in an event loop, False otherwise
41
+ """
42
+ try:
43
+ asyncio.get_running_loop()
44
+ return True
45
+ except RuntimeError:
46
+ return False
47
+
48
+ # Configuration constants
49
+ FEATURE_EXTRACTION_MAX_RETRIES = 3
50
+ FEATURE_EXTRACTION_RETRY_DELAY = 0.3 # seconds between retries
51
+ PAGE_LOAD_TIMEOUT = 20 # seconds to wait for page load
52
+ DYNAMIC_CONTENT_WAIT = 3 # seconds to wait for dynamic content after page load
53
+
54
+
55
+ def retry_feature_extraction(max_retries=FEATURE_EXTRACTION_MAX_RETRIES, delay=FEATURE_EXTRACTION_RETRY_DELAY):
56
+ """
57
+ Decorator to retry feature extraction with exponential backoff.
58
+
59
+ Args:
60
+ max_retries (int): Maximum number of retry attempts
61
+ delay (float): Initial delay between retries in seconds
62
+
63
+ Returns:
64
+ Decorated function with retry logic
65
+ """
66
+ def decorator(func):
67
+ @wraps(func)
68
+ def wrapper(*args, **kwargs):
69
+ last_exception = None
70
+ for attempt in range(max_retries):
71
+ try:
72
+ result = func(*args, **kwargs)
73
+ # If we got a valid result (not np.nan), return it
74
+ if result is not None and not (isinstance(result, float) and np.isnan(result)):
75
+ return result
76
+ # If result is np.nan or None, retry
77
+ if attempt < max_retries - 1:
78
+ time.sleep(delay * (attempt + 1)) # Exponential backoff
79
+ except Exception as e:
80
+ last_exception = e
81
+ if attempt < max_retries - 1:
82
+ time.sleep(delay * (attempt + 1))
83
+ continue
84
+
85
+ # All retries exhausted, return np.nan
86
+ if last_exception:
87
+ logger.debug(f"Feature extraction failed after {max_retries} attempts: {last_exception}")
88
+ return np.nan
89
+ return wrapper
90
+ return decorator
91
+
92
+
93
+ def create_playwright_browser():
94
+ """
95
+ Create a Playwright browser context for dynamic content extraction.
96
+
97
+ Returns:
98
+ tuple: (playwright instance, browser, context, page) or (None, None, None, None) if failed
99
+ """
100
+ if not PLAYWRIGHT_AVAILABLE:
101
+ logger.warning("Playwright is not installed. Install with: pip install playwright && playwright install")
102
+ return None, None, None, None
103
+
104
+ try:
105
+ # Start Playwright
106
+ playwright = sync_playwright().start()
107
+
108
+ # Launch browser with stealth options
109
+ browser = playwright.chromium.launch(
110
+ headless=True,
111
+ args=[
112
+ '--no-sandbox',
113
+ '--disable-dev-shm-usage',
114
+ '--disable-gpu',
115
+ '--disable-extensions',
116
+ '--disable-blink-features=AutomationControlled',
117
+ ]
118
+ )
119
+
120
+ # Create context with stealth settings
121
+ context = browser.new_context(
122
+ viewport={'width': 1920, 'height': 1080},
123
+ user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
124
+ locale='en-US',
125
+ timezone_id='America/New_York',
126
+ permissions=[],
127
+ extra_http_headers={
128
+ 'Accept-Language': 'en-US,en;q=0.9',
129
+ 'DNT': '1',
130
+ },
131
+ ignore_https_errors=True,
132
+ )
133
+
134
+ # Add init script to hide webdriver property
135
+ context.add_init_script("""
136
+ Object.defineProperty(navigator, 'webdriver', {
137
+ get: () => undefined
138
+ });
139
+
140
+ // Override the navigator.plugins to avoid detection
141
+ Object.defineProperty(navigator, 'plugins', {
142
+ get: () => [1, 2, 3, 4, 5]
143
+ });
144
+
145
+ // Override the navigator.languages to avoid detection
146
+ Object.defineProperty(navigator, 'languages', {
147
+ get: () => ['en-US', 'en']
148
+ });
149
+ """)
150
+
151
+ # Create a new page
152
+ page = context.new_page()
153
+
154
+ # Set default timeout
155
+ page.set_default_timeout(PAGE_LOAD_TIMEOUT * 1000) # Convert to milliseconds
156
+
157
+ logger.info("✓ Playwright browser created successfully")
158
+ return playwright, browser, context, page
159
+
160
+ except Exception as e:
161
+ logger.warning(f"Failed to create Playwright browser: {type(e).__name__}: {str(e)[:200]}")
162
+ logger.info("Playwright will be skipped. Install with: pip install playwright && playwright install")
163
+ return None, None, None, None
164
+
165
+
166
+ def fetch_page_with_playwright(url, page=None):
167
+ """
168
+ Fetch a webpage using Playwright to handle dynamic JavaScript content.
169
+
170
+ Args:
171
+ url (str): URL to fetch
172
+ page (playwright.sync_api.Page, optional): Existing page instance
173
+
174
+ Returns:
175
+ tuple: (BeautifulSoup object, (playwright, browser, context, page)) or (None, None) if failed
176
+ """
177
+ resources_created = False
178
+ playwright_instance = None
179
+ browser = None
180
+ context = None
181
+
182
+ try:
183
+ if page is None:
184
+ playwright_instance, browser, context, page = create_playwright_browser()
185
+ resources_created = True
186
+
187
+ if page is None:
188
+ return None, None
189
+
190
+ logger.info(f"Fetching URL with Playwright: {url}")
191
+
192
+ # Navigate to the URL
193
+ try:
194
+ response = page.goto(url, wait_until='networkidle', timeout=PAGE_LOAD_TIMEOUT * 1000)
195
+
196
+ # Check if navigation was successful
197
+ if response and response.status >= 400:
198
+ logger.warning(f"Playwright received HTTP {response.status}")
199
+ except PlaywrightTimeoutError:
200
+ logger.warning("Playwright navigation timeout, continuing anyway...")
201
+ except Exception as nav_error:
202
+ logger.warning(f"Playwright navigation error: {nav_error}")
203
+ # Continue anyway - page might have partially loaded
204
+
205
+ # Wait for document ready state
206
+ try:
207
+ page.wait_for_load_state('domcontentloaded', timeout=10000)
208
+ page.wait_for_load_state('load', timeout=10000)
209
+ except PlaywrightTimeoutError:
210
+ logger.debug("Load state timeout, continuing...")
211
+
212
+ # Additional wait for dynamic content to load
213
+ time.sleep(DYNAMIC_CONTENT_WAIT)
214
+
215
+ # Wait for body element to be present
216
+ try:
217
+ page.wait_for_selector('body', timeout=10000)
218
+ except PlaywrightTimeoutError:
219
+ logger.debug("Body selector timeout, continuing...")
220
+
221
+ # Get the fully rendered page source
222
+ page_source = page.content()
223
+
224
+ # Parse with BeautifulSoup
225
+ soup = BeautifulSoup(page_source, 'html.parser')
226
+
227
+ logger.info(f"✓ Successfully fetched and rendered page with Playwright")
228
+
229
+ # Return soup and resources (let caller handle cleanup)
230
+ if resources_created:
231
+ return soup, (playwright_instance, browser, context, page)
232
+ else:
233
+ return soup, None
234
+
235
+ except Exception as e:
236
+ logger.warning(f"Playwright fetch failed: {type(e).__name__}: {str(e)[:100]}")
237
+ if resources_created:
238
+ try:
239
+ if page:
240
+ page.close()
241
+ if context:
242
+ context.close()
243
+ if browser:
244
+ browser.close()
245
+ if playwright_instance:
246
+ playwright_instance.stop()
247
+ except:
248
+ pass
249
+ return None, None
250
+
251
+
252
+ def fetch_page_with_playwright_safe(url, page=None):
253
+ """
254
+ Thread-safe wrapper for fetch_page_with_playwright that works in both sync and async contexts.
255
+
256
+ This function detects if it's running inside an asyncio event loop (e.g., FastAPI/uvicorn)
257
+ and automatically runs the Playwright sync API in a separate thread to avoid conflicts.
258
+
259
+ Args:
260
+ url (str): URL to fetch
261
+ page (playwright.sync_api.Page, optional): Existing page instance
262
+
263
+ Returns:
264
+ tuple: (BeautifulSoup object, playwright_resources) or (None, None) if failed
265
+ """
266
+ if _is_running_in_event_loop():
267
+ # Running in async context (e.g., FastAPI) - use thread pool
268
+ logger.debug("Detected async context - running Playwright in separate thread")
269
+ try:
270
+ # Run the sync function in a thread pool executor
271
+ # This isolates Playwright's sync API from the asyncio event loop
272
+ import concurrent.futures
273
+ with concurrent.futures.ThreadPoolExecutor() as executor:
274
+ future = executor.submit(fetch_page_with_playwright, url, page)
275
+ result = future.result(timeout=PAGE_LOAD_TIMEOUT + 30) # Add buffer to timeout
276
+ return result
277
+ except Exception as e:
278
+ logger.warning(f"Failed to run Playwright in thread: {type(e).__name__}: {str(e)[:100]}")
279
+ return None, None
280
+ else:
281
+ # Running in sync context (e.g., direct script execution) - call directly
282
+ logger.debug("Detected sync context - running Playwright directly")
283
+ return fetch_page_with_playwright(url, page)
284
+
285
+
286
+ def get_modern_browser_headers(url=None):
287
+ """
288
+ Generate modern browser headers to mimic a real Chrome browser.
289
+
290
+ Args:
291
+ url (str, optional): The target URL for setting referer/origin
292
+
293
+ Returns:
294
+ dict: Dictionary of HTTP headers
295
+ """
296
+ headers = {
297
+ # Modern Chrome User-Agent (Chrome 120+)
298
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
299
+
300
+ # Accept headers
301
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
302
+ 'Accept-Language': 'en-US,en;q=0.9',
303
+ 'Accept-Encoding': 'gzip, deflate, br',
304
+
305
+ # Security headers (Sec-Fetch-* headers)
306
+ 'Sec-Fetch-Dest': 'document',
307
+ 'Sec-Fetch-Mode': 'navigate',
308
+ 'Sec-Fetch-Site': 'none',
309
+ 'Sec-Fetch-User': '?1',
310
+
311
+ # Additional browser headers
312
+ 'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
313
+ 'Sec-Ch-Ua-Mobile': '?0',
314
+ 'Sec-Ch-Ua-Platform': '"Windows"',
315
+
316
+ # Connection settings
317
+ 'Connection': 'keep-alive',
318
+ 'Upgrade-Insecure-Requests': '1',
319
+
320
+ # DNT (Do Not Track)
321
+ 'DNT': '1',
322
+
323
+ # Cache control
324
+ 'Cache-Control': 'max-age=0',
325
+ }
326
+
327
+ # Add referer if URL is provided
328
+ if url:
329
+ try:
330
+ parsed = urlparse(url)
331
+ if parsed.scheme and parsed.netloc:
332
+ origin = f"{parsed.scheme}://{parsed.netloc}"
333
+ headers['Origin'] = origin
334
+ headers['Referer'] = origin + '/'
335
+ except Exception:
336
+ pass
337
+
338
+ return headers
339
+
340
+
341
+ def create_session_with_retries(max_retries=3):
342
+ """
343
+ Create a requests session with retry logic and connection pooling.
344
+
345
+ Args:
346
+ max_retries (int): Maximum number of retries for failed requests
347
+
348
+ Returns:
349
+ requests.Session: Configured session object
350
+ """
351
+ session = requests.Session()
352
+
353
+ # Configure retry strategy
354
+ retry_strategy = Retry(
355
+ total=max_retries,
356
+ backoff_factor=1, # Wait 1s, 2s, 4s between retries
357
+ status_forcelist=[429, 500, 502, 503, 504], # Retry on these HTTP status codes
358
+ allowed_methods=["GET", "HEAD"], # Only retry safe methods
359
+ raise_on_status=False # Don't raise exception, let us handle it
360
+ )
361
+
362
+ # Mount adapter with retry strategy
363
+ adapter = HTTPAdapter(max_retries=retry_strategy, pool_connections=10, pool_maxsize=10)
364
+ session.mount("http://", adapter)
365
+ session.mount("https://", adapter)
366
+
367
+ return session
368
+
369
+
370
+ def preprocess_url(url):
371
+ """
372
+ Add http:// scheme to URL if missing.
373
+
374
+ Args:
375
+ url (str): Original URL
376
+
377
+ Returns:
378
+ str: URL with scheme
379
+ """
380
+ url = url.strip()
381
+ if not url.startswith(('http://', 'https://')):
382
+ return f'http://{url}'
383
+ return url
384
+
385
+
386
+ def extract_feature_with_retry(soup, feature_name, extraction_func, max_retries=FEATURE_EXTRACTION_MAX_RETRIES):
387
+ """
388
+ Extract a single feature with retry logic.
389
+
390
+ All features are returned as integers:
391
+ - 'has_*' features return binary 0 or 1
392
+ - 'number_of_*' and 'length_of_*' features return whole numbers (integers)
393
+ - On failure, returns -1 (instead of np.nan) to maintain integer type consistency
394
+
395
+ Args:
396
+ soup (BeautifulSoup): Parsed HTML content
397
+ feature_name (str): Name of the feature being extracted
398
+ extraction_func (callable): Function that performs the extraction
399
+ max_retries (int): Maximum number of retry attempts
400
+
401
+ Returns:
402
+ int: Feature value as integer, or -1 if all retries fail
403
+ """
404
+ last_exception = None
405
+
406
+ for attempt in range(max_retries):
407
+ try:
408
+ result = extraction_func(soup)
409
+
410
+ # If we got a valid result, cast to int and return it
411
+ if result is not None and not (isinstance(result, float) and np.isnan(result)):
412
+ if attempt > 0:
413
+ logger.debug(f"Feature '{feature_name}' extracted successfully on attempt {attempt + 1}")
414
+ # Ensure integer type for all features
415
+ return int(result)
416
+
417
+ # If result is None or np.nan, retry with a small delay
418
+ if attempt < max_retries - 1:
419
+ time.sleep(FEATURE_EXTRACTION_RETRY_DELAY * (attempt + 1))
420
+
421
+ except Exception as e:
422
+ last_exception = e
423
+ if attempt < max_retries - 1:
424
+ logger.debug(f"Retry {attempt + 1}/{max_retries} for '{feature_name}': {type(e).__name__}")
425
+ time.sleep(FEATURE_EXTRACTION_RETRY_DELAY * (attempt + 1))
426
+ continue
427
+
428
+ # All retries exhausted - return -1 to indicate failure while maintaining integer type
429
+ if last_exception:
430
+ logger.debug(f"Error extracting {feature_name} after {max_retries} attempts: {last_exception}")
431
+
432
+ return -1
433
+
434
+
435
+ def extract_features(url):
436
+ """
437
+ Extract all 43 features from a URL and its webpage.
438
+
439
+ Args:
440
+ url (str): URL to extract features from
441
+
442
+ Returns:
443
+ dict: Dictionary containing all 43 features as integers.
444
+ - 'has_*' features: 0 (not present), 1 (present), or -1 (extraction failed/unreachable)
445
+ - 'number_of_*' and 'length_of_*' features: >= 0 count/length, or -1 (extraction failed/unreachable)
446
+ """
447
+ # Initialize all features with -1 (for unreachable sites)
448
+ # Using -1 instead of None to maintain integer type consistency
449
+ features = {
450
+ 'has_title': -1,
451
+ 'has_input': -1,
452
+ 'has_button': -1,
453
+ 'has_image': -1,
454
+ 'has_submit': -1,
455
+ 'has_link': -1,
456
+ 'has_password': -1,
457
+ 'has_email_input': -1,
458
+ 'has_hidden_element': -1,
459
+ 'has_audio': -1,
460
+ 'has_video': -1,
461
+ 'number_of_inputs': -1,
462
+ 'number_of_buttons': -1,
463
+ 'number_of_images': -1,
464
+ 'number_of_option': -1,
465
+ 'number_of_list': -1,
466
+ 'number_of_th': -1,
467
+ 'number_of_tr': -1,
468
+ 'number_of_href': -1,
469
+ 'number_of_paragraph': -1,
470
+ 'number_of_script': -1,
471
+ 'length_of_title': -1,
472
+ 'has_h1': -1,
473
+ 'has_h2': -1,
474
+ 'has_h3': -1,
475
+ 'length_of_text': -1,
476
+ 'number_of_clickable_button': -1,
477
+ 'number_of_a': -1,
478
+ 'number_of_img': -1,
479
+ 'number_of_div': -1,
480
+ 'number_of_figure': -1,
481
+ 'has_footer': -1,
482
+ 'has_form': -1,
483
+ 'has_text_area': -1,
484
+ 'has_iframe': -1,
485
+ 'has_text_input': -1,
486
+ 'number_of_meta': -1,
487
+ 'has_nav': -1,
488
+ 'has_object': -1,
489
+ 'has_picture': -1,
490
+ 'number_of_sources': -1,
491
+ 'number_of_span': -1,
492
+ 'number_of_table': -1
493
+ }
494
+
495
+ # Preprocess URL
496
+ processed_url = preprocess_url(url)
497
+
498
+ # Try multiple approaches with increasing robustness
499
+ response = None
500
+ soup = None
501
+ last_error = None
502
+
503
+ # Approach 1: Use session with retry logic and modern headers
504
+ try:
505
+ logger.info(f"Attempting to fetch URL with session and retries: {processed_url}")
506
+ session = create_session_with_retries(max_retries=3)
507
+ headers = get_modern_browser_headers(processed_url)
508
+
509
+ response = session.get(
510
+ processed_url,
511
+ headers=headers,
512
+ timeout=15,
513
+ allow_redirects=True,
514
+ verify=False
515
+ )
516
+
517
+ # Check if we got a successful response
518
+ if response.status_code == 200:
519
+ logger.info(f"✓ Successfully fetched URL (status: {response.status_code})")
520
+ # Decode content with UTF-8 and replace errors to avoid encoding warnings
521
+ html_content = response.content.decode('utf-8', errors='replace')
522
+ soup = BeautifulSoup(html_content, 'html.parser')  # from_encoding dropped: html_content is already a decoded str, and passing it would itself trigger a bs4 warning
523
+ else:
524
+ logger.warning(f"Received HTTP {response.status_code} for {processed_url}")
525
+ raise requests.exceptions.HTTPError(f"HTTP {response.status_code}")
526
+
527
+ except requests.exceptions.Timeout as e:
528
+ last_error = f"Timeout error: Request took longer than 15 seconds"
529
+ logger.warning(f"✗ {last_error}")
530
+ except requests.exceptions.ConnectionError as e:
531
+ last_error = f"Connection error: Unable to establish connection to {processed_url}"
532
+ logger.warning(f"✗ {last_error}")
533
+ except requests.exceptions.HTTPError as e:
534
+ last_error = f"HTTP error: {str(e)}"
535
+ logger.warning(f"✗ {last_error}")
536
+ except requests.exceptions.TooManyRedirects as e:
537
+ last_error = f"Too many redirects: URL redirected too many times"
538
+ logger.warning(f"✗ {last_error}")
539
+ except Exception as e:
540
+ last_error = f"Unexpected error in approach 1: {type(e).__name__}: {str(e)[:100]}"
541
+ logger.warning(f"✗ {last_error}")
542
+
543
+ # Approach 2: Fallback to simple request with enhanced headers if first approach failed
544
+ if soup is None:
545
+ try:
546
+ logger.info(f"Trying fallback approach with enhanced headers...")
547
+ time.sleep(2) # Brief delay before retry
548
+
549
+ # More complete headers to mimic a real browser
550
+ headers = {
551
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
552
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
553
+ 'Accept-Language': 'en-US,en;q=0.9',
554
+ 'Accept-Encoding': 'gzip, deflate, br',
555
+ 'DNT': '1',
556
+ 'Connection': 'keep-alive',
557
+ 'Upgrade-Insecure-Requests': '1',
558
+ 'Sec-Fetch-Dest': 'document',
559
+ 'Sec-Fetch-Mode': 'navigate',
560
+ 'Sec-Fetch-Site': 'none',
561
+ 'Sec-Fetch-User': '?1',
562
+ 'Cache-Control': 'max-age=0',
563
+ }
564
+
565
+ response = requests.get(
566
+ processed_url,
567
+ headers=headers,
568
+ timeout=10,
569
+ allow_redirects=True,
570
+ verify=False
571
+ )
572
+
573
+ if response.status_code == 200:
574
+ logger.info(f"✓ Fallback approach succeeded (status: {response.status_code})")
575
+ # Decode content with UTF-8 and replace errors to avoid encoding warnings
576
+ html_content = response.content.decode('utf-8', errors='replace')
577
+ soup = BeautifulSoup(html_content, 'html.parser')  # from_encoding dropped: html_content is already a decoded str, and passing it would itself trigger a bs4 warning
578
+ else:
579
+ last_error = f"HTTP {response.status_code}: {response.reason}"
580
+ logger.warning(f"✗ Fallback failed with HTTP {response.status_code}")
581
+
582
+ except Exception as e:
583
+ last_error = f"Fallback error: {type(e).__name__}: {str(e)[:100]}"
584
+ logger.warning(f"✗ {last_error}")
585
+
586
+ # Approach 3: Use Playwright for dynamic content if previous approaches failed
587
+ playwright_resources = None
588
+ if soup is None:
589
+ try:
590
+ logger.info(f"Trying Playwright approach for dynamic content...")
591
+ time.sleep(1) # Brief delay before retry
592
+
593
+ soup, playwright_resources = fetch_page_with_playwright_safe(processed_url)
594
+
595
+ if soup is not None:
596
+ logger.info(f"✓ Playwright approach succeeded")
597
+ else:
598
+ last_error = "Playwright fetch failed"
599
+ logger.warning(f"✗ Playwright approach failed")
600
+
601
+ except Exception as e:
602
+ last_error = f"Playwright error: {type(e).__name__}: {str(e)[:100]}"
603
+ logger.warning(f"✗ {last_error}")
604
+
605
+ # If all approaches failed, return features with their -1 sentinel values
606
+ if soup is None:
607
+ error_msg = last_error if last_error else "Unknown error occurred"
608
+ logger.error(f" ✗ Failed to extract features from {processed_url}: {error_msg}")
609
+ print(f" ✗ Failed to extract features: {error_msg}")
610
+ return features
611
+
612
+ # Successfully fetched content, now extract features
613
+ # Elements map to 1 (present) / 0 (absent); parsing failures fall back to -1 via extract_feature_with_retry
614
+ # Each feature extraction includes retry logic for robustness
615
+
616
+ # 1. has_title
617
+ features['has_title'] = extract_feature_with_retry(
618
+ soup, 'has_title',
619
+ lambda s: 1 if s.find('title') else 0
620
+ )
621
+
622
+ # 2. has_input
623
+ features['has_input'] = extract_feature_with_retry(
624
+ soup, 'has_input',
625
+ lambda s: 1 if s.find('input') else 0
626
+ )
627
+
628
+ # 3. has_button
629
+ features['has_button'] = extract_feature_with_retry(
630
+ soup, 'has_button',
631
+ lambda s: 1 if s.find('button') else 0
632
+ )
633
+
634
+ # 4. has_image
635
+ features['has_image'] = extract_feature_with_retry(
636
+ soup, 'has_image',
637
+ lambda s: 1 if s.find('img') else 0
638
+ )
639
+
640
+ # 5. has_submit
641
+ features['has_submit'] = extract_feature_with_retry(
642
+ soup, 'has_submit',
643
+ lambda s: 1 if s.find('input', {'type': 'submit'}) else 0
644
+ )
645
+
646
+ # 6. has_link
647
+ features['has_link'] = extract_feature_with_retry(
648
+ soup, 'has_link',
649
+ lambda s: 1 if s.find('a') else 0
650
+ )
651
+
652
+ # 7. has_password
653
+ features['has_password'] = extract_feature_with_retry(
654
+ soup, 'has_password',
655
+ lambda s: 1 if s.find('input', {'type': 'password'}) else 0
656
+ )
657
+
658
+ # 8. has_email_input
659
+ features['has_email_input'] = extract_feature_with_retry(
660
+ soup, 'has_email_input',
661
+ lambda s: 1 if s.find('input', {'type': 'email'}) else 0
662
+ )
663
+
664
+ # 9. has_hidden_element
665
+ features['has_hidden_element'] = extract_feature_with_retry(
666
+ soup, 'has_hidden_element',
667
+ lambda s: 1 if s.find('input', {'type': 'hidden'}) else 0
668
+ )
669
+
670
+ # 10. has_audio
671
+ features['has_audio'] = extract_feature_with_retry(
672
+ soup, 'has_audio',
673
+ lambda s: 1 if s.find('audio') else 0
674
+ )
675
+
676
+ # 11. has_video
677
+ features['has_video'] = extract_feature_with_retry(
678
+ soup, 'has_video',
679
+ lambda s: 1 if s.find('video') else 0
680
+ )
681
+
682
+ # 12. number_of_inputs
683
+ features['number_of_inputs'] = extract_feature_with_retry(
684
+ soup, 'number_of_inputs',
685
+ lambda s: len(s.find_all('input'))
686
+ )
687
+
688
+ # 13. number_of_buttons
689
+ features['number_of_buttons'] = extract_feature_with_retry(
690
+ soup, 'number_of_buttons',
691
+ lambda s: len(s.find_all('button'))
692
+ )
693
+
694
+ # 14. number_of_images
695
+ features['number_of_images'] = extract_feature_with_retry(
696
+ soup, 'number_of_images',
697
+ lambda s: len(s.find_all('img'))
698
+ )
699
+
700
+ # 15. number_of_option
701
+ features['number_of_option'] = extract_feature_with_retry(
702
+ soup, 'number_of_option',
703
+ lambda s: len(s.find_all('option'))
704
+ )
705
+
706
+ # 16. number_of_list
707
+ features['number_of_list'] = extract_feature_with_retry(
708
+ soup, 'number_of_list',
709
+ lambda s: len(s.find_all('li'))
710
+ )
711
+
712
+ # 17. number_of_th
713
+ features['number_of_th'] = extract_feature_with_retry(
714
+ soup, 'number_of_th',
715
+ lambda s: len(s.find_all('th'))
716
+ )
717
+
718
+ # 18. number_of_tr
719
+ features['number_of_tr'] = extract_feature_with_retry(
720
+ soup, 'number_of_tr',
721
+ lambda s: len(s.find_all('tr'))
722
+ )
723
+
724
+ # 19. number_of_href
725
+ features['number_of_href'] = extract_feature_with_retry(
726
+ soup, 'number_of_href',
727
+ lambda s: len(s.find_all('a', href=True))
728
+ )
729
+
730
+ # 20. number_of_paragraph
731
+ features['number_of_paragraph'] = extract_feature_with_retry(
732
+ soup, 'number_of_paragraph',
733
+ lambda s: len(s.find_all('p'))
734
+ )
735
+
736
+ # 21. number_of_script
737
+ features['number_of_script'] = extract_feature_with_retry(
738
+ soup, 'number_of_script',
739
+ lambda s: len(s.find_all('script'))
740
+ )
741
+
742
+ # 22. length_of_title
743
+ def extract_title_length(s):
744
+ title_tag = s.find('title')
745
+ return len(title_tag.get_text()) if title_tag else 0
746
+
747
+ features['length_of_title'] = extract_feature_with_retry(
748
+ soup, 'length_of_title',
749
+ extract_title_length
750
+ )
751
+
752
+ # 23. has_h1
753
+ features['has_h1'] = extract_feature_with_retry(
754
+ soup, 'has_h1',
755
+ lambda s: 1 if s.find('h1') else 0
756
+ )
757
+
758
+ # 24. has_h2
759
+ features['has_h2'] = extract_feature_with_retry(
760
+ soup, 'has_h2',
761
+ lambda s: 1 if s.find('h2') else 0
762
+ )
763
+
764
+ # 25. has_h3
765
+ features['has_h3'] = extract_feature_with_retry(
766
+ soup, 'has_h3',
767
+ lambda s: 1 if s.find('h3') else 0
768
+ )
769
+
770
+ # 26. length_of_text
771
+ def extract_text_length(s):
772
+ # Create a copy to avoid modifying the original soup
773
+ soup_copy = BeautifulSoup(str(s), 'html.parser')
774
+ for script_or_style in soup_copy(['script', 'style']):
775
+ script_or_style.decompose()
776
+ body = soup_copy.find('body')
777
+ if body:
778
+ text = body.get_text()
779
+ return len(text)
780
+ return 0
781
+
782
+ features['length_of_text'] = extract_feature_with_retry(
783
+ soup, 'length_of_text',
784
+ extract_text_length
785
+ )
786
+
787
+ # 27. number_of_clickable_button
788
+ def extract_clickable_buttons(s):
789
+ buttons = len(s.find_all('button'))
790
+ input_buttons = len(s.find_all('input', {'type': ['button', 'submit', 'reset']}))
791
+ return buttons + input_buttons
792
+
793
+ features['number_of_clickable_button'] = extract_feature_with_retry(
794
+ soup, 'number_of_clickable_button',
795
+ extract_clickable_buttons
796
+ )
797
+
798
+ # 28. number_of_a
799
+ features['number_of_a'] = extract_feature_with_retry(
800
+ soup, 'number_of_a',
801
+ lambda s: len(s.find_all('a'))
802
+ )
803
+
804
+ # 29. number_of_img
805
+ features['number_of_img'] = extract_feature_with_retry(
806
+ soup, 'number_of_img',
807
+ lambda s: len(s.find_all('img'))
808
+ )
809
+
810
+ # 30. number_of_div
811
+ features['number_of_div'] = extract_feature_with_retry(
812
+ soup, 'number_of_div',
813
+ lambda s: len(s.find_all('div'))
814
+ )
815
+
816
+ # 31. number_of_figure
817
+ features['number_of_figure'] = extract_feature_with_retry(
818
+ soup, 'number_of_figure',
819
+ lambda s: len(s.find_all('figure'))
820
+ )
821
+
822
+ # 32. has_footer
823
+ features['has_footer'] = extract_feature_with_retry(
824
+ soup, 'has_footer',
825
+ lambda s: 1 if s.find('footer') else 0
826
+ )
827
+
828
+ # 33. has_form
829
+ features['has_form'] = extract_feature_with_retry(
830
+ soup, 'has_form',
831
+ lambda s: 1 if s.find('form') else 0
832
+ )
833
+
834
+ # 34. has_text_area
835
+ features['has_text_area'] = extract_feature_with_retry(
836
+ soup, 'has_text_area',
837
+ lambda s: 1 if s.find('textarea') else 0
838
+ )
839
+
840
+ # 35. has_iframe
841
+ features['has_iframe'] = extract_feature_with_retry(
842
+ soup, 'has_iframe',
843
+ lambda s: 1 if s.find('iframe') else 0
844
+ )
845
+
846
+ # 36. has_text_input
847
+ features['has_text_input'] = extract_feature_with_retry(
848
+ soup, 'has_text_input',
849
+ lambda s: 1 if s.find('input', {'type': 'text'}) else 0
850
+ )
851
+
852
+ # 37. number_of_meta
853
+ features['number_of_meta'] = extract_feature_with_retry(
854
+ soup, 'number_of_meta',
855
+ lambda s: len(s.find_all('meta'))
856
+ )
857
+
858
+ # 38. has_nav
859
+ features['has_nav'] = extract_feature_with_retry(
860
+ soup, 'has_nav',
861
+ lambda s: 1 if s.find('nav') else 0
862
+ )
863
+
864
+ # 39. has_object
865
+ features['has_object'] = extract_feature_with_retry(
866
+ soup, 'has_object',
867
+ lambda s: 1 if s.find('object') else 0
868
+ )
869
+
870
+ # 40. has_picture
871
+ features['has_picture'] = extract_feature_with_retry(
872
+ soup, 'has_picture',
873
+ lambda s: 1 if s.find('picture') else 0
874
+ )
875
+
876
+ # 41. number_of_sources
877
+ features['number_of_sources'] = extract_feature_with_retry(
878
+ soup, 'number_of_sources',
879
+ lambda s: len(s.find_all('source'))
880
+ )
881
+
882
+ # 42. number_of_span
883
+ features['number_of_span'] = extract_feature_with_retry(
884
+ soup, 'number_of_span',
885
+ lambda s: len(s.find_all('span'))
886
+ )
887
+
888
+ # 43. number_of_table
889
+ features['number_of_table'] = extract_feature_with_retry(
890
+ soup, 'number_of_table',
891
+ lambda s: len(s.find_all('table'))
892
+ )
893
+
894
+ # Clean up Playwright resources if they were created
895
+ if playwright_resources is not None:
896
+ try:
897
+ playwright_instance, browser, context, page = playwright_resources
898
+ if page:
899
+ page.close()
900
+ if context:
901
+ context.close()
902
+ if browser:
903
+ browser.close()
904
+ if playwright_instance:
905
+ playwright_instance.stop()
906
+ logger.debug("Playwright resources closed successfully")
907
+ except Exception as e:
908
+ logger.debug(f"Error closing Playwright resources: {e}")
909
+
910
+ # Count successfully extracted features
911
+ # Features with value >= 0 are successfully extracted, -1 indicates failure
912
+ successful_features = sum(1 for v in features.values() if isinstance(v, int) and v >= 0)
913
+ failed_features = sum(1 for v in features.values() if v == -1)
914
+
915
+ if failed_features > 0:
916
+ logger.warning(f"⚠ Extracted {successful_features}/43 features from {processed_url} ({failed_features} failed)")
917
+ else:
918
+ logger.info(f"✓ Successfully extracted all 43 features from {processed_url}")
919
+
920
+ return features
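For orientation, a minimal sketch of how a caller might consume this function; turning the dict into a single-row pandas frame is an assumption about downstream use, since the model interface lives elsewhere in the repo:

    # Sketch only — assumes pandas (pinned in requirements.txt below)
    import pandas as pd
    feats = extract_features("example.com")                 # dict of 43 integer features
    row = pd.DataFrame([feats])                             # one row, one column per feature
    unreachable = all(v == -1 for v in feats.values())      # all -1 means the page could not be fetched
    print(row.shape, "unreachable" if unreachable else "fetched")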
requirements.txt ADDED
@@ -0,0 +1,22 @@
1
+ # FastAPI and server
2
+ fastapi==0.115.0
3
+ uvicorn[standard]==0.32.0
4
+ pydantic==2.9.2
5
+ huggingface_hub==0.35.3
6
+
7
+ # Data processing
8
+ pandas==2.2.2
9
+ numpy==2.0.2
10
+
11
+ # Machine learning
12
+ scikit-learn==1.6.1
13
+ lightgbm==4.6.0
14
+ xgboost==3.1.2
15
+ catboost==1.2.8
16
+ joblib==1.5.2
17
+
18
+ # Feature extraction dependencies
19
+ requests==2.32.3
20
+ beautifulsoup4==4.12.3
21
+ urllib3==2.2.3
22
+ playwright==1.48.0
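One hedged operational note: the playwright package does not ship browser binaries, so the Playwright fallback used during feature extraction only works if a browser has been installed in the environment (presumably handled at image-build time). Locally that is typically something like the command below; chromium is an assumption about which browser the fetch helper launches:

    python -m playwright install chromium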
run.py ADDED
@@ -0,0 +1,40 @@
1
+ """
2
+ Simple script to run the FastAPI application.
3
+ """
4
+
5
+ import uvicorn
6
+ import sys
7
+ import os
8
+
9
+ def main():
10
+ """Run the FastAPI application."""
11
+ # Get port from environment variable or use default
12
+ port = int(os.getenv("PORT", "8000"))
13
+
14
+ # Get host from environment variable or use default
15
+ host = os.getenv("HOST", "0.0.0.0")
16
+
17
+ # Check if reload flag is passed
18
+ reload = "--reload" in sys.argv or "-r" in sys.argv
19
+
20
+ print("="*60)
21
+ print("🔒 Phishing URL Detection API")
22
+ print("="*60)
23
+ print(f"Starting server on {host}:{port}")
24
+ print(f"Reload mode: {'Enabled' if reload else 'Disabled'}")
25
+ print(f"\nAPI Documentation:")
26
+ print(f" - Swagger UI: http://{host if host != '0.0.0.0' else 'localhost'}:{port}/docs")
27
+ print(f" - ReDoc: http://{host if host != '0.0.0.0' else 'localhost'}:{port}/redoc")
28
+ print("="*60)
29
+
30
+ uvicorn.run(
31
+ "main:app",
32
+ host=host,
33
+ port=port,
34
+ reload=reload,
35
+ log_level="info"
36
+ )
37
+
38
+ if __name__ == "__main__":
39
+ main()
40
+
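Typical invocations of this launcher (illustrative, based on the PORT/HOST environment variables and the reload flag it reads):

    python run.py                                       # serves on 0.0.0.0:8000 by default
    PORT=7860 HOST=127.0.0.1 python run.py --reload     # custom bind address with auto-reload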