Rasel Santillan committed on
Commit 8a9ac80 · 0 Parent(s)

Squashed clean history

Files changed (12)
  1. .dockerignore +83 -0
  2. .gitattributes +35 -0
  3. Dockerfile +73 -0
  4. README.md +206 -0
  5. app.py +28 -0
  6. categorization.py +103 -0
  7. main.py +305 -0
  8. model/__init__.py +8 -0
  9. model/model.py +298 -0
  10. model/url_feature_extractor.py +920 -0
  11. requirements.txt +22 -0
  12. run.py +40 -0
.dockerignore ADDED
@@ -0,0 +1,83 @@
# Python cache and compiled files
__pycache__/
*.py[cod]
*$py.class
*.so
.Python

# Virtual environments
venv/
env/
ENV/
.venv/
virtualenv/

# IDE and editor files
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store

# Git
.git/
.gitignore
.gitattributes

# Jupyter notebooks
*.ipynb
.ipynb_checkpoints/

# Testing
.pytest_cache/
.coverage
htmlcov/
.tox/
*.cover

# Documentation
*.md
docs/
README.md

# Logs
*.log
logs/

# Environment variables
.env
.env.local
.env.*.local

# Dataset and training files
data/
datasets/
*.csv
*.xlsx
*.json

# Model training artifacts (keep only the final model)
checkpoints/
experiments/
mlruns/

# Development dependencies
requirements-dev.txt
setup.py
setup.cfg

# CI/CD
.github/
.gitlab-ci.yml
.travis.yml

# Docker
Dockerfile.dev
docker-compose.yml
docker-compose.*.yml

# Misc
*.bak
*.tmp
.cache/
.gitattributes ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,73 @@
# Dockerfile for Phishing URL Detection FastAPI Application
# Base image: Python 3.10 slim for smaller image size
FROM python:3.10-slim

# Set environment variables
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1

# Install system dependencies required for ML libraries and Playwright
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgomp1 \
    wget \
    # Playwright/Chromium dependencies
    libnss3 \
    libnspr4 \
    libatk1.0-0 \
    libatk-bridge2.0-0 \
    libcups2 \
    libdrm2 \
    libdbus-1-3 \
    libxkbcommon0 \
    libxcomposite1 \
    libxdamage1 \
    libxfixes3 \
    libxrandr2 \
    libgbm1 \
    libpango-1.0-0 \
    libcairo2 \
    libasound2 \
    libatspi2.0-0 \
    libxshmfence1 \
    && rm -rf /var/lib/apt/lists/*

# Create non-root user for security
RUN useradd -m -u 1000 user && \
    mkdir -p /app && \
    chown -R user:user /app

# Set working directory
WORKDIR /app

# Copy requirements first for better layer caching
COPY --chown=user:user requirements.txt .

# Switch to non-root user
USER user

# Add the user's local bin to PATH
ENV PATH="/home/user/.local/bin:$PATH"

# Install Python dependencies
RUN pip install --user --no-cache-dir --upgrade pip && \
    pip install --user --no-cache-dir -r requirements.txt

# Install Playwright browsers (as user)
# System dependencies are already installed above, so only the browser binaries are needed
RUN python -m playwright install chromium

# Copy application code and model
COPY --chown=user:user . .

# Expose ports (7860 is the default; 8000 for compatibility)
EXPOSE 7860 8000

# Health check (uses port 7860 by default)
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD python -c "import requests; requests.get('http://localhost:7860/health')" || exit 1

# Run the application
# Use app.py for Hugging Face Spaces compatibility; defaults to port 7860
CMD ["python", "app.py"]
README.md ADDED
@@ -0,0 +1,206 @@
---
title: Phishing URL Detection API
emoji: 🔒
colorFrom: red
colorTo: yellow
sdk: docker
pinned: false
license: mit
app_port: 7860
---

# Phishing URL Detection API

A FastAPI-based REST API for detecting phishing URLs using machine learning. The service analyzes URL features and webpage content to classify URLs as legitimate or phishing attempts.

## Features

- 🔍 **Real-time URL Analysis**: Extracts 43 features from URLs and their webpages
- 🤖 **Machine Learning**: Uses a stacking ensemble model for accurate predictions
- 🚀 **Fast API**: Built with FastAPI for high performance and automatic documentation
- 🐳 **Docker Support**: Containerized for easy deployment
- 📊 **Confidence Scores**: Returns prediction confidence for better decision-making
- 🔒 **CORS Enabled**: Accessible from web browsers

## Project Structure

```
url-phish-fastapi/
├── app.py                         # Entry point for Hugging Face Spaces
├── main.py                        # FastAPI application
├── categorization.py              # Risk categorization helpers
├── run.py
├── model/
│   ├── __init__.py                # Package initialization
│   ├── model.py                   # Model loading and prediction logic
│   └── url_feature_extractor.py   # Feature extraction from URLs
├── requirements.txt               # Python dependencies
├── Dockerfile                     # Docker configuration
├── .dockerignore                  # Docker ignore patterns
└── README.md                      # This file
```

The pre-trained stacking model (`url_stacking_model.joblib`) is downloaded from the Hugging Face Hub on first use (see `model/model.py`).

## API Endpoints

### Health Check
- **GET** `/` - Root endpoint
- **GET** `/health` - Health check endpoint

### Prediction
- **POST** `/predict` - Analyze a URL for phishing detection

**Request Body:**
```json
{
  "url": "http://example.com"
}
```

**Response:**
```json
{
  "url": "http://example.com",
  "prediction": "legitimate",
  "confidence": 0.95,
  "predicted_label": 0,
  "phish_probability": 0.05,
  "phish_probability_percent": 5.0,
  "risk_category": "Safe",
  "binary_classification": "Legitimate",
  "error": null
}
```
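
### Model Update (Online Learning)
- **POST** `/update` - Report the true label for a misclassified URL; the meta-model is updated in place via `partial_fit` (see `main.py`)

The request body carries the URL and its true label (`0` for legitimate, `1` for phishing). A minimal Python sketch for submitting feedback (illustrative only; the URL below is just an example and the service is assumed to be reachable on `localhost:7860`):

```python
import requests

# Hypothetical feedback payload: report that this URL was actually phishing
feedback = {"url": "http://example.com/login", "true_label": 1}  # 1 = phishing, 0 = legitimate
resp = requests.post("http://localhost:7860/update", json=feedback, timeout=120)
print(resp.json()["status"])  # "success" or "failed"
```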

### Interactive Documentation
- **Swagger UI**: `http://localhost:7860/docs`
- **ReDoc**: `http://localhost:7860/redoc`

## Installation & Usage

### Option 1: Local Development

1. **Install dependencies:**
   ```bash
   pip install -r requirements.txt
   ```

2. **Run the application:**
   ```bash
   python app.py
   ```

3. **Access the API:**
   - API: http://localhost:7860
   - Docs: http://localhost:7860/docs

### Option 2: Docker (Recommended)

1. **Build the Docker image:**
   ```bash
   docker build -t phishing-url-api .
   ```

2. **Run the container:**
   ```bash
   docker run -p 7860:7860 phishing-url-api
   ```

3. **Access the API:**
   - API: http://localhost:7860
   - Docs: http://localhost:7860/docs

### Option 3: Docker with a Custom Port

```bash
docker run -p 8000:8000 -e PORT=8000 phishing-url-api
```

## Testing

Verify that the API is working with curl:

```bash
# Health check
curl http://localhost:7860/health

# Predict a URL
curl -X POST http://localhost:7860/predict \
  -H "Content-Type: application/json" \
  -d '{"url": "https://www.google.com"}'
```
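
Or from Python (a minimal sketch using the `requests` library; assumes the service is running locally on port 7860):

```python
import requests

resp = requests.post(
    "http://localhost:7860/predict",
    json={"url": "https://www.google.com"},
    timeout=60,
)
result = resp.json()

if result["prediction"] == "unknown":
    # Returned when the URL is unreachable or feature extraction fails
    print("Could not analyze URL:", result.get("error"))
else:
    print(result["prediction"], result["risk_category"], f"{result['phish_probability_percent']:.1f}%")
```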

## Model Information

The API uses a **stacking ensemble model** that combines multiple base classifiers:
- Random Forest
- Gradient Boosting
- XGBoost
- LightGBM
- Logistic Regression (meta-model)
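
In simplified form, the two levels fit together like this: each base model emits a phishing probability, and the meta-model makes the final call from those probabilities. The sketch below is illustrative only, with stand-in models and synthetic data rather than the shipped artifact:

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Synthetic stand-in for the 43 extracted features
X, y = make_classification(n_samples=500, n_features=43, random_state=0)

base_models = {
    "rf": RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y),
    "gb": GradientBoostingClassifier(random_state=0).fit(X, y),
}

# Level 0: each base model contributes its positive-class probability
meta_features = np.column_stack(
    [m.predict_proba(X)[:, 1] for m in base_models.values()]
)

# Level 1: the meta-model combines those probabilities into the final score
meta_model = LogisticRegression().fit(meta_features, y)
phish_probability = meta_model.predict_proba(meta_features[:1])[:, 1][0]
print(f"phish_probability: {phish_probability:.3f}")
```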

### Features Extracted (43 total)

The model analyzes various HTML elements and webpage characteristics:
- Form elements (inputs, buttons, password fields)
- Media elements (images, videos, audio)
- Structural elements (divs, tables, lists)
- Content metrics (text length, title length)
- Interactive elements (links, scripts, iframes)
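
Internally, `model/url_feature_extractor.py` exposes `extract_features(url)`, which returns these 43 values as a flat dictionary of integers; a value of `-1` marks a feature that could not be extracted (for example, when the page is unreachable). A quick sketch:

```python
from model.url_feature_extractor import extract_features

features = extract_features("https://www.example.com")
print(len(features))                 # 43
print(features["has_title"])         # 1 if the page has a <title>, 0 if not, -1 on failure
print(features["number_of_script"])  # count of <script> tags, or -1 on failure
```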

## Dependencies

- **FastAPI**: Web framework
- **Uvicorn**: ASGI server
- **Scikit-learn**: Machine learning
- **Pandas/NumPy**: Data processing
- **BeautifulSoup4**: HTML parsing
- **Requests**: HTTP requests
- **XGBoost/LightGBM**: Gradient boosting models
- **Playwright** (optional): Headless Chromium for rendering JavaScript-heavy pages
- **huggingface_hub**: Downloads the pre-trained model artifact

## Error Handling

The API handles several error scenarios:
- **400 Bad Request**: Invalid or empty URL
- **500 Internal Server Error**: Model loading or prediction failures
- **Unknown prediction**: Returned when the URL is unreachable or feature extraction fails

## Performance Considerations

- The model is loaded once on startup (singleton pattern)
- Feature extraction may take 5-10 seconds for live URLs
- Unreachable URLs return an "unknown" prediction
- HTTPS certificate verification is disabled for broader compatibility

## Security Notes

- The API makes outbound HTTP requests to analyze URLs
- SSL verification is disabled during feature extraction
- Apply appropriate network security when deploying
- Consider rate limiting for production use

## Deployment

### Hugging Face Spaces

This project is configured for deployment on Hugging Face Spaces using the Docker SDK.

### Other Platforms

The Docker container can be deployed on:
- AWS ECS/Fargate
- Google Cloud Run
- Azure Container Instances
- Kubernetes
- Any Docker-compatible platform

## License

MIT (see the `license` field in the front matter above).

## Contributing

[Add contribution guidelines here]

## Support

For issues and questions, please [create an issue](https://github.com/yourusername/yourrepo/issues).
app.py ADDED
@@ -0,0 +1,28 @@
"""
Entry point for Hugging Face Spaces deployment.
This file is required by Hugging Face Spaces and must be named 'app.py'.
"""

import os
import uvicorn
from main import app

if __name__ == "__main__":
    # Default to port 7860 (Hugging Face Spaces standard)
    port = int(os.getenv("PORT", "7860"))
    host = os.getenv("HOST", "0.0.0.0")

    print("=" * 60)
    print("🔒 Phishing URL Detection API")
    print("=" * 60)
    print(f"Starting server on {host}:{port}")
    print(f"API Documentation: http://{host if host != '0.0.0.0' else 'localhost'}:{port}/docs")
    print("=" * 60)

    uvicorn.run(
        app,
        host=host,
        port=port,
        log_level="info"
    )
categorization.py ADDED
@@ -0,0 +1,103 @@
"""
Risk categorization module for phishing detection results.

This module provides functions to categorize phishing probability scores
into risk categories and binary classifications.
"""

from enum import Enum
from typing import Tuple


class RiskCategory(str, Enum):
    """Risk category based on phishing probability score."""
    SAFE = "Safe"
    LOW = "Low"
    MODERATE = "Moderate"
    HIGH = "Dangerous"
    CRITICAL = "Critical"


class BinaryClassification(str, Enum):
    """Binary classification of phishing detection result."""
    LEGITIMATE = "Legitimate"
    PHISHING = "Phishing"


# Risk category thresholds (score is on a 0-100 scale)
RISK_THRESHOLDS = {
    RiskCategory.SAFE: (0, 25),        # score < 25
    RiskCategory.LOW: (25, 50),        # 25 <= score < 50
    RiskCategory.MODERATE: (50, 70),   # 50 <= score < 70
    RiskCategory.HIGH: (70, 85),       # 70 <= score < 85
    RiskCategory.CRITICAL: (85, 101),  # score >= 85
}

# Binary classification threshold
PHISHING_THRESHOLD = 70  # score >= 70 is classified as Phishing


def get_risk_category(phish_probability_score: float) -> RiskCategory:
    """
    Determine the risk category based on phishing probability score.

    Args:
        phish_probability_score: Phishing probability score (0-100 scale)

    Returns:
        RiskCategory: The corresponding risk category
    """
    if phish_probability_score < 25:
        return RiskCategory.SAFE
    elif phish_probability_score < 50:
        return RiskCategory.LOW
    elif phish_probability_score < 70:
        return RiskCategory.MODERATE
    elif phish_probability_score < 85:
        return RiskCategory.HIGH
    else:
        return RiskCategory.CRITICAL


def get_binary_classification(phish_probability_score: float) -> BinaryClassification:
    """
    Determine the binary classification based on phishing probability score.

    Args:
        phish_probability_score: Phishing probability score (0-100 scale)

    Returns:
        BinaryClassification: Legitimate if score < 70, Phishing otherwise
    """
    if phish_probability_score < PHISHING_THRESHOLD:
        return BinaryClassification.LEGITIMATE
    else:
        return BinaryClassification.PHISHING


def categorize_phishing_result(phish_probability: float) -> Tuple[RiskCategory, BinaryClassification, float]:
    """
    Categorize a phishing detection result.

    This function takes a phishing probability (0-1 scale) and returns:
    - Risk category (Safe, Low, Moderate, Dangerous, Critical)
    - Binary classification (Legitimate or Phishing)
    - The probability score on a 0-100 scale

    Args:
        phish_probability: Phishing probability from the model (0-1 scale)

    Returns:
        Tuple containing:
        - RiskCategory: The risk category
        - BinaryClassification: The binary classification
        - float: The probability score on a 0-100 scale
    """
    # Convert from 0-1 scale to 0-100 scale
    score_100 = phish_probability * 100

    risk_category = get_risk_category(score_100)
    binary_classification = get_binary_classification(score_100)

    return risk_category, binary_classification, score_100
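
A quick usage sketch of the helpers above (illustrative values only, not part of the committed file):

```python
from categorization import categorize_phishing_result

# A model output of 0.75 on the 0-1 scale maps to a score of 75 on the 0-100 scale
risk, label, score = categorize_phishing_result(0.75)
print(risk.value, label.value, score)  # Dangerous Phishing 75.0
```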
main.py ADDED
@@ -0,0 +1,305 @@
1
+ """
2
+ FastAPI application for phishing URL detection.
3
+ """
4
+
5
+ from fastapi import FastAPI, HTTPException, status
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+ from pydantic import BaseModel, Field, validator
8
+ from typing import Optional
9
+ import logging
10
+
11
+ from model.model import predict_url, load_model, get_meta_features_and_update
12
+ from categorization import categorize_phishing_result, RiskCategory, BinaryClassification
13
+
14
+ # Configure logging
15
+ logging.basicConfig(
16
+ level=logging.INFO,
17
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
18
+ )
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # Create FastAPI application
22
+ app = FastAPI(
23
+ title="Phishing URL Detection API",
24
+ description="API for detecting phishing URLs using machine learning. Analyzes URL features to classify URLs as legitimate or phishing attempts.",
25
+ version="1.0.0",
26
+ docs_url="/docs",
27
+ redoc_url="/redoc"
28
+ )
29
+
30
+ # Configure CORS middleware to allow web browser access
31
+ app.add_middleware(
32
+ CORSMiddleware,
33
+ allow_origins=["*"], # In production, replace with specific origins
34
+ allow_credentials=True,
35
+ allow_methods=["*"],
36
+ allow_headers=["*"],
37
+ )
38
+
39
+
40
+ # Pydantic models for request/response validation
41
+ class URLRequest(BaseModel):
42
+ """Request model for URL prediction."""
43
+ url: str = Field(
44
+ ...,
45
+ description="The URL to analyze for phishing detection",
46
+ example="http://example.com"
47
+ )
48
+
49
+ @validator('url')
50
+ def validate_url(cls, v):
51
+ """Validate that URL is not empty."""
52
+ if not v or not v.strip():
53
+ raise ValueError("URL cannot be empty")
54
+ return v.strip()
55
+
56
+
57
+ class PredictionResponse(BaseModel):
58
+ """Response model for URL prediction."""
59
+ url: str = Field(..., description="The analyzed URL")
60
+ prediction: str = Field(..., description="Prediction result: 'phishing', 'legitimate', or 'unknown'")
61
+ confidence: float = Field(..., description="Confidence score (0-1)")
62
+ predicted_label: int = Field(..., description="Predicted label: 0 (legitimate), 1 (phishing), -1 (unknown)")
63
+ phish_probability: float = Field(..., description="Probability of being phishing (0-1)")
64
+ phish_probability_percent: float = Field(..., description="Probability of being phishing (0-100 scale)")
65
+ risk_category: str = Field(..., description="Risk category: 'Safe', 'Low', 'Moderate', 'Dangerous', or 'Critical'")
66
+ binary_classification: str = Field(..., description="Binary classification: 'Legitimate' or 'Phishing'")
67
+ error: Optional[str] = Field(None, description="Error message if prediction failed")
68
+
69
+
70
+ class HealthResponse(BaseModel):
71
+ """Response model for health check."""
72
+ status: str = Field(..., description="Service status")
73
+ message: str = Field(..., description="Status message")
74
+
75
+
76
+ class UpdateRequest(BaseModel):
77
+ """Request model for online learning update."""
78
+ url: str = Field(..., description="The URL that was misclassified")
79
+ true_label: int = Field(..., description="True label: 0 (legitimate) or 1 (phishing)")
80
+
81
+ @validator('true_label')
82
+ def validate_label(cls, v):
83
+ """Validate that true_label is 0 or 1."""
84
+ if v not in [0, 1]:
85
+ raise ValueError("true_label must be 0 (legitimate) or 1 (phishing)")
86
+ return v
87
+
88
+
89
+ class UpdateResponse(BaseModel):
90
+ """Response model for online learning update."""
91
+ status: str = Field(..., description="Update status")
92
+ message: str = Field(..., description="Update message")
93
+ url: str = Field(..., description="The URL that was updated")
94
+ true_label: int = Field(..., description="The true label used for update")
95
+ meta_features: Optional[list] = Field(None, description="Meta features used for update")
96
+
97
+
98
+ # API Endpoints
99
+ @app.get("/", response_model=HealthResponse, tags=["Health"])
100
+ async def root():
101
+ """
102
+ Root endpoint - Health check.
103
+
104
+ Returns:
105
+ HealthResponse: Service status information
106
+ """
107
+ return HealthResponse(
108
+ status="healthy",
109
+ message="Phishing URL Detection API is running"
110
+ )
111
+
112
+
113
+ @app.get("/health", response_model=HealthResponse, tags=["Health"])
114
+ async def health_check():
115
+ """
116
+ Health check endpoint.
117
+
118
+ Returns:
119
+ HealthResponse: Service status information
120
+ """
121
+ return HealthResponse(
122
+ status="healthy",
123
+ message="Service is operational"
124
+ )
125
+
126
+
127
+ @app.post("/predict", response_model=PredictionResponse, tags=["Prediction"])
128
+ async def predict(request: URLRequest):
129
+ """
130
+ Predict whether a URL is phishing or legitimate.
131
+
132
+ This endpoint:
133
+ 1. Validates the input URL
134
+ 2. Extracts features from the URL and its webpage
135
+ 3. Uses a machine learning model to classify the URL
136
+ 4. Returns the prediction with confidence score
137
+
138
+ Args:
139
+ request: URLRequest containing the URL to analyze
140
+
141
+ Returns:
142
+ PredictionResponse: Prediction result with confidence score
143
+
144
+ Raises:
145
+ HTTPException: 400 for invalid input, 500 for server errors
146
+ """
147
+ try:
148
+ logger.info(f"Received prediction request for URL: {request.url}")
149
+
150
+ # Validate URL is not empty (already done by Pydantic validator)
151
+ if not request.url:
152
+ raise HTTPException(
153
+ status_code=status.HTTP_400_BAD_REQUEST,
154
+ detail="URL cannot be empty"
155
+ )
156
+
157
+ # Call prediction function
158
+ result = predict_url(request.url)
159
+
160
+ # Add risk categorization
161
+ risk_category, binary_classification, score_100 = categorize_phishing_result(
162
+ result['phish_probability']
163
+ )
164
+ result['phish_probability_percent'] = score_100
165
+ result['risk_category'] = risk_category.value
166
+ result['binary_classification'] = binary_classification.value
167
+
168
+ logger.info(f"Prediction successful: {result['prediction']} | Risk: {risk_category.value} | Classification: {binary_classification.value}")
169
+
170
+ return PredictionResponse(**result)
171
+
172
+ except ValueError as e:
173
+ # Handle validation errors
174
+ logger.error(f"Validation error: {str(e)}")
175
+ raise HTTPException(
176
+ status_code=status.HTTP_400_BAD_REQUEST,
177
+ detail=f"Invalid input: {str(e)}"
178
+ )
179
+
180
+ except FileNotFoundError as e:
181
+ # Handle model file not found
182
+ logger.error(f"Model file not found: {str(e)}")
183
+ raise HTTPException(
184
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
185
+ detail="Model file not found. Please ensure the model is properly deployed."
186
+ )
187
+
188
+ except Exception as e:
189
+ # Handle all other errors
190
+ logger.error(f"Prediction error: {str(e)}", exc_info=True)
191
+ raise HTTPException(
192
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
193
+ detail=f"An error occurred during prediction: {str(e)}"
194
+ )
195
+
196
+
197
+ @app.post("/update", response_model=UpdateResponse, tags=["Update"])
198
+ async def update_model(request: UpdateRequest):
199
+ """
200
+ Update the meta model using online learning with partial_fit.
201
+
202
+ This endpoint:
203
+ 1. Extracts features from the URL
204
+ 2. Generates meta-features using base models
205
+ 3. Updates the SGD meta model with partial_fit
206
+ 4. Saves the updated model
207
+
208
+ Args:
209
+ request: UpdateRequest containing URL and true label
210
+
211
+ Returns:
212
+ UpdateResponse: Update status and meta features used
213
+
214
+ Raises:
215
+ HTTPException: 400 for invalid input, 500 for server errors
216
+ """
217
+ try:
218
+ logger.info(f"Received update request for URL: {request.url} with label: {request.true_label}")
219
+
220
+ # Validate inputs
221
+ if not request.url or not request.url.strip():
222
+ raise HTTPException(
223
+ status_code=status.HTTP_400_BAD_REQUEST,
224
+ detail="URL cannot be empty"
225
+ )
226
+
227
+ if request.true_label not in [0, 1]:
228
+ raise HTTPException(
229
+ status_code=status.HTTP_400_BAD_REQUEST,
230
+ detail="true_label must be 0 (legitimate) or 1 (phishing)"
231
+ )
232
+
233
+ # Get meta features and update model
234
+ meta_features, updated = get_meta_features_and_update(request.url, request.true_label)
235
+
236
+ if not updated:
237
+ logger.warning(f"Failed to update model for URL: {request.url}")
238
+ return UpdateResponse(
239
+ status="failed",
240
+ message="Failed to update model - feature extraction may have failed",
241
+ url=request.url,
242
+ true_label=request.true_label,
243
+ meta_features=None
244
+ )
245
+
246
+ logger.info(f"✅ Model updated successfully for URL: {request.url}")
247
+
248
+ return UpdateResponse(
249
+ status="success",
250
+ message="Meta model updated successfully with partial_fit",
251
+ url=request.url,
252
+ true_label=request.true_label,
253
+ meta_features=meta_features.tolist() if meta_features is not None else None
254
+ )
255
+
256
+ except ValueError as e:
257
+ logger.error(f"Validation error in update: {str(e)}")
258
+ raise HTTPException(
259
+ status_code=status.HTTP_400_BAD_REQUEST,
260
+ detail=f"Invalid input: {str(e)}"
261
+ )
262
+
263
+ except Exception as e:
264
+ logger.error(f"Update error: {str(e)}", exc_info=True)
265
+ raise HTTPException(
266
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
267
+ detail=f"An error occurred during model update: {str(e)}"
268
+ )
269
+
270
+
271
+ # Startup event
272
+ @app.on_event("startup")
273
+ async def startup_event():
274
+ """
275
+ Startup event handler.
276
+ Loads the model on application startup to ensure it's ready.
277
+ """
278
+ try:
279
+ logger.info("Starting up Phishing URL Detection API...")
280
+ from model.model import load_model
281
+ load_model() # Pre-load model on startup
282
+ logger.info("✅ Model loaded successfully on startup")
283
+ except Exception as e:
284
+ logger.error(f"❌ Failed to load model on startup: {str(e)}")
285
+ # Don't prevent startup, but log the error
286
+
287
+
288
+ # Shutdown event
289
+ @app.on_event("shutdown")
290
+ async def shutdown_event():
291
+ """
292
+ Shutdown event handler.
293
+ """
294
+ logger.info("Shutting down Phishing URL Detection API...")
295
+
296
+
297
+ if __name__ == "__main__":
298
+ import uvicorn
299
+ uvicorn.run(
300
+ "main:app",
301
+ host="0.0.0.0",
302
+ port=8000,
303
+ reload=True,
304
+ log_level="info"
305
+ )
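
For reference, the endpoints above can also be exercised without a running server by using FastAPI's test client (a minimal sketch; it relies on the `httpx` package that FastAPI's `TestClient` uses, and the prediction call still downloads the model and makes real network requests for feature extraction):

```python
from fastapi.testclient import TestClient
from main import app

client = TestClient(app)

# Health check should succeed as soon as the app imports cleanly
resp = client.get("/health")
print(resp.status_code, resp.json())  # 200 {'status': 'healthy', 'message': 'Service is operational'}

# Prediction request (slow: fetches the model artifact and the target page)
resp = client.post("/predict", json={"url": "https://www.example.com"})
print(resp.json()["prediction"])
```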
model/__init__.py ADDED
@@ -0,0 +1,8 @@
"""
Model package for phishing URL detection.
"""

from .model import predict_url, load_model

__all__ = ['predict_url', 'load_model']
model/model.py ADDED
@@ -0,0 +1,298 @@
1
+ """
2
+ Model loading and prediction module for phishing URL detection.
3
+ """
4
+
5
+ import logging
6
+ import numpy as np
7
+ import pandas as pd
8
+ import joblib
9
+ from typing import Dict, Any, Optional, Tuple
10
+ import warnings
11
+ from huggingface_hub import hf_hub_download
12
+
13
+ # Import feature extraction function
14
+ from .url_feature_extractor import extract_features
15
+
16
+ warnings.filterwarnings("ignore", message="X does not have valid feature names", category=UserWarning)
17
+
18
+ # Configure logging
19
+ logging.basicConfig(level=logging.INFO)
20
+ logger = logging.getLogger(__name__)
21
+
22
+ # Global variable to cache the loaded model (singleton pattern)
23
+ _model_cache: Optional[Dict[str, Any]] = None
24
+
25
+
26
+ def get_model_path() -> str:
27
+ """
28
+ Download the model from Hugging Face Hub and return the local path.
29
+
30
+ Returns:
31
+ str: Local path to the downloaded model file
32
+ """
33
+ model_path = hf_hub_download(
34
+ repo_id="xxemrzru/url-stacking-model",
35
+ filename="url_stacking_model.joblib"
36
+ )
37
+ return model_path
38
+
39
+
40
+ def load_model() -> Dict[str, Any]:
41
+ """
42
+ Load the saved stacking model from file.
43
+ Uses singleton pattern to load model only once.
44
+
45
+ Returns:
46
+ dict: Dictionary containing model components:
47
+ - base_models: Dictionary of base models
48
+ - meta_model: Final meta model
49
+ - feature_names: List of feature names
50
+ - model_names: List of base model names
51
+
52
+ Raises:
53
+ FileNotFoundError: If model file doesn't exist
54
+ Exception: If model loading fails
55
+ """
56
+ global _model_cache
57
+
58
+ # Return cached model if already loaded
59
+ if _model_cache is not None:
60
+ logger.info("Using cached model")
61
+ return _model_cache
62
+
63
+ try:
64
+ model_path = get_model_path()
65
+
66
+ logger.info(f"Loading model from: {model_path}")
67
+ model_data = joblib.load(model_path)
68
+
69
+ # Cache the model
70
+ _model_cache = {
71
+ "base_models": model_data["base_models"],
72
+ "meta_model": model_data["meta_model"],
73
+ "feature_names": model_data["feature_names"],
74
+ "model_names": model_data["model_names"]
75
+ }
76
+
77
+ logger.info("✅ Model loaded successfully")
78
+ return _model_cache
79
+
80
+ except Exception as e:
81
+ logger.error(f"❌ Failed to load model: {str(e)}")
82
+ raise
83
+
84
+
85
+ def predict_from_features(features_dict: Dict[str, Any], model_components: Dict[str, Any]) -> Dict[str, Any]:
86
+ """
87
+ Make predictions given a dictionary of extracted features.
88
+
89
+ Args:
90
+ features_dict: Dictionary where keys are feature names and values are feature values
91
+ model_components: The loaded components returned by load_model()
92
+
93
+ Returns:
94
+ dict: Contains 'predicted_label' (0 or 1) and 'phish_probability' (float)
95
+
96
+ Raises:
97
+ ValueError: If required features are missing
98
+ """
99
+ base_models = model_components["base_models"]
100
+ meta_model = model_components["meta_model"]
101
+ feature_names = model_components["feature_names"]
102
+ model_names = model_components["model_names"]
103
+
104
+ # Convert to DataFrame to ensure shape consistency
105
+ X = pd.DataFrame([features_dict])
106
+
107
+ # Ensure all required columns exist
108
+ missing_cols = set(feature_names) - set(X.columns)
109
+ if missing_cols:
110
+ raise ValueError(f"❌ Missing required features: {missing_cols}")
111
+
112
+ # Keep only known features and order them correctly
113
+ X = X[feature_names]
114
+
115
+ # Level 0: Base model predictions
116
+ meta_features = np.zeros((X.shape[0], len(base_models)))
117
+ for idx, (model_name, model) in enumerate(base_models.items()):
118
+ meta_features[:, idx] = model.predict_proba(X)[:, 1]
119
+
120
+ meta_features_df = pd.DataFrame(meta_features, columns=[f"{n}_pred" for n in model_names])
121
+
122
+ # Level 1: Meta-model prediction
123
+ final_pred = meta_model.predict(meta_features_df)[0]
124
+ final_prob = meta_model.predict_proba(meta_features_df)[:, 1][0]
125
+
126
+ return {
127
+ "predicted_label": int(final_pred),
128
+ "phish_probability": float(final_prob)
129
+ }
130
+
131
+
132
+ def predict_url(url: str) -> Dict[str, Any]:
133
+ """
134
+ Main prediction function that takes a raw URL and returns prediction.
135
+
136
+ This function:
137
+ 1. Loads the model (cached after first load)
138
+ 2. Extracts features from the URL using url_feature_extractor
139
+ 3. Makes prediction using the stacking model
140
+
141
+ Args:
142
+ url: Raw URL string to analyze
143
+
144
+ Returns:
145
+ dict: Prediction result containing:
146
+ - url: The input URL
147
+ - prediction: "phishing" or "legitimate"
148
+ - confidence: Probability score (0-1)
149
+ - predicted_label: 0 (legitimate) or 1 (phishing)
150
+ - phish_probability: Same as confidence
151
+
152
+ Raises:
153
+ Exception: If feature extraction or prediction fails
154
+ """
155
+ try:
156
+ # Load model (uses cache if already loaded)
157
+ model_components = load_model()
158
+
159
+ # Extract features from URL
160
+ logger.info(f"Extracting features from URL: {url}")
161
+ features_dict = extract_features(url)
162
+
163
+ # Check if feature extraction failed (all -1 values indicate extraction failure)
164
+ if all(v == -1 for v in features_dict.values()):
165
+ logger.warning(f"Feature extraction failed for URL: {url}")
166
+ # Return a default prediction for unreachable URLs
167
+ return {
168
+ "url": url,
169
+ "prediction": "unknown",
170
+ "confidence": 0.0,
171
+ "predicted_label": -1,
172
+ "phish_probability": 0.0,
173
+ "error": "Failed to extract features - URL may be unreachable"
174
+ }
175
+
176
+ # Make prediction
177
+ logger.info("Making prediction...")
178
+ prediction_result = predict_from_features(features_dict, model_components)
179
+
180
+ # Format response
181
+ predicted_label = prediction_result["predicted_label"]
182
+ phish_probability = prediction_result["phish_probability"]
183
+
184
+ result = {
185
+ "url": url,
186
+ "prediction": "phishing" if predicted_label == 1 else "legitimate",
187
+ "confidence": phish_probability if predicted_label == 1 else (1 - phish_probability),
188
+ "predicted_label": predicted_label,
189
+ "phish_probability": phish_probability
190
+ }
191
+
192
+ logger.info(f"✅ Prediction complete: {result['prediction']} (confidence: {result['confidence']:.2%})")
193
+ return result
194
+
195
+ except Exception as e:
196
+ logger.error(f"❌ Prediction failed: {str(e)}")
197
+ raise
198
+
199
+
200
+ def get_meta_features_and_update(url: str, true_label: int) -> Tuple[Optional[np.ndarray], bool]:
201
+ """
202
+ Extract meta-features from URL and update the SGD meta model using partial_fit.
203
+
204
+ This function:
205
+ 1. Extracts features from the URL
206
+ 2. Generates meta-features using base models (probability outputs)
207
+ 3. Updates the SGD meta model with partial_fit(meta_features, true_label)
208
+ 4. Saves the updated model to disk
209
+
210
+ Args:
211
+ url: Raw URL string to extract features from
212
+ true_label: True label (0 for legitimate, 1 for phishing)
213
+
214
+ Returns:
215
+ tuple: (meta_features_array, success_flag)
216
+ - meta_features_array: numpy array of meta features used for update
217
+ - success_flag: boolean indicating if update was successful
218
+
219
+ Raises:
220
+ Exception: If feature extraction or model update fails
221
+ """
222
+ try:
223
+ # Load model components
224
+ model_components = load_model()
225
+ base_models = model_components["base_models"]
226
+ meta_model = model_components["meta_model"]
227
+ feature_names = model_components["feature_names"]
228
+ model_names = model_components["model_names"]
229
+
230
+ # Extract features from URL
231
+ logger.info(f"Extracting features for update from URL: {url}")
232
+ features_dict = extract_features(url)
233
+
234
+ # Check if feature extraction failed
235
+ if all(v == -1 for v in features_dict.values()):
236
+ logger.warning(f"Feature extraction failed for URL update: {url}")
237
+ return None, False
238
+
239
+ # Convert to DataFrame and ensure proper ordering
240
+ X = pd.DataFrame([features_dict])
241
+ missing_cols = set(feature_names) - set(X.columns)
242
+ if missing_cols:
243
+ raise ValueError(f"Missing required features: {missing_cols}")
244
+ X = X[feature_names]
245
+
246
+ # Generate meta-features using base models (probability outputs)
247
+ meta_features = np.zeros((X.shape[0], len(base_models)))
248
+ for idx, (model_name, model) in enumerate(base_models.items()):
249
+ meta_features[:, idx] = model.predict_proba(X)[:, 1]
250
+
251
+ meta_features_df = pd.DataFrame(meta_features, columns=[f"{n}_pred" for n in model_names])
252
+
253
+ # Update the SGD meta model using partial_fit
254
+ logger.info(f"Updating meta model with partial_fit for label: {true_label}")
255
+ meta_model.partial_fit(meta_features_df, [true_label], classes=[0, 1])
256
+
257
+ # Update the cached model with the new meta model
258
+ global _model_cache
259
+ if _model_cache is not None:
260
+ _model_cache["meta_model"] = meta_model
261
+
262
+ # Save the updated model to disk
263
+ save_updated_model(model_components, meta_model)
264
+
265
+ logger.info(f"✅ Model updated successfully for URL: {url}")
266
+ return meta_features_df.values[0], True
267
+
268
+ except Exception as e:
269
+ logger.error(f"❌ Failed to update model: {str(e)}")
270
+ return None, False
271
+
272
+
273
+ def save_updated_model(model_components: Dict[str, Any], updated_meta_model) -> None:
274
+ """
275
+ Save the updated model components to disk.
276
+
277
+ Args:
278
+ model_components: Dictionary containing model components
279
+ updated_meta_model: The updated SGD meta model
280
+ """
281
+ try:
282
+ model_path = get_model_path()
283
+
284
+ # Create updated model data
285
+ updated_model_data = {
286
+ "base_models": model_components["base_models"],
287
+ "meta_model": updated_meta_model, # Use the updated meta model
288
+ "feature_names": model_components["feature_names"],
289
+ "model_names": model_components["model_names"]
290
+ }
291
+
292
+ # Save to disk
293
+ joblib.dump(updated_model_data, model_path)
294
+ logger.info(f"✅ Updated model saved to: {model_path}")
295
+
296
+ except Exception as e:
297
+ logger.error(f"❌ Failed to save updated model: {str(e)}")
298
+ raise
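
For reference, a minimal sketch of calling this module directly from Python (illustrative only; the first call downloads the model artifact from the Hugging Face Hub and fetches the target page, so it can take several seconds):

```python
from model.model import load_model, predict_url

load_model()  # optional warm-up; predict_url() loads and caches the model on demand

result = predict_url("https://www.example.com")
print(result["prediction"], result["phish_probability"])
```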
model/url_feature_extractor.py ADDED
@@ -0,0 +1,920 @@
1
+ """
2
+ URL Feature Extraction System for Phishing Detection
3
+ Extracts 43 specific features from URLs and their corresponding webpages.
4
+ """
5
+
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+ import pandas as pd
9
+ from urllib.parse import urlparse
10
+ import warnings
11
+ import time
12
+ import logging
13
+ import numpy as np
14
+ from requests.adapters import HTTPAdapter
15
+ from urllib3.util.retry import Retry
16
+ from functools import wraps
17
+ import asyncio
18
+ import sys
19
+
20
+ # Playwright imports (optional - graceful degradation if not installed)
21
+ try:
22
+ from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
23
+ PLAYWRIGHT_AVAILABLE = True
24
+ except ImportError:
25
+ PLAYWRIGHT_AVAILABLE = False
26
+ PlaywrightTimeoutError = Exception # Fallback for type hints
27
+
28
+ warnings.filterwarnings('ignore')
29
+
30
+ # Configure logging
31
+ logging.basicConfig(level=logging.INFO)
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ def _is_running_in_event_loop():
36
+ """
37
+ Check if code is running inside an asyncio event loop.
38
+
39
+ Returns:
40
+ bool: True if running in an event loop, False otherwise
41
+ """
42
+ try:
43
+ asyncio.get_running_loop()
44
+ return True
45
+ except RuntimeError:
46
+ return False
47
+
48
+ # Configuration constants
49
+ FEATURE_EXTRACTION_MAX_RETRIES = 3
50
+ FEATURE_EXTRACTION_RETRY_DELAY = 0.3 # seconds between retries
51
+ PAGE_LOAD_TIMEOUT = 20 # seconds to wait for page load
52
+ DYNAMIC_CONTENT_WAIT = 3 # seconds to wait for dynamic content after page load
53
+
54
+
55
+ def retry_feature_extraction(max_retries=FEATURE_EXTRACTION_MAX_RETRIES, delay=FEATURE_EXTRACTION_RETRY_DELAY):
56
+ """
57
+ Decorator to retry feature extraction with exponential backoff.
58
+
59
+ Args:
60
+ max_retries (int): Maximum number of retry attempts
61
+ delay (float): Initial delay between retries in seconds
62
+
63
+ Returns:
64
+ Decorated function with retry logic
65
+ """
66
+ def decorator(func):
67
+ @wraps(func)
68
+ def wrapper(*args, **kwargs):
69
+ last_exception = None
70
+ for attempt in range(max_retries):
71
+ try:
72
+ result = func(*args, **kwargs)
73
+ # If we got a valid result (not np.nan), return it
74
+ if result is not None and not (isinstance(result, float) and np.isnan(result)):
75
+ return result
76
+ # If result is np.nan or None, retry
77
+ if attempt < max_retries - 1:
78
+ time.sleep(delay * (attempt + 1)) # Exponential backoff
79
+ except Exception as e:
80
+ last_exception = e
81
+ if attempt < max_retries - 1:
82
+ time.sleep(delay * (attempt + 1))
83
+ continue
84
+
85
+ # All retries exhausted, return np.nan
86
+ if last_exception:
87
+ logger.debug(f"Feature extraction failed after {max_retries} attempts: {last_exception}")
88
+ return np.nan
89
+ return wrapper
90
+ return decorator
91
+
92
+
93
+ def create_playwright_browser():
94
+ """
95
+ Create a Playwright browser context for dynamic content extraction.
96
+
97
+ Returns:
98
+ tuple: (playwright instance, browser, context, page) or (None, None, None, None) if failed
99
+ """
100
+ if not PLAYWRIGHT_AVAILABLE:
101
+ logger.warning("Playwright is not installed. Install with: pip install playwright && playwright install")
102
+ return None, None, None, None
103
+
104
+ try:
105
+ # Start Playwright
106
+ playwright = sync_playwright().start()
107
+
108
+ # Launch browser with stealth options
109
+ browser = playwright.chromium.launch(
110
+ headless=True,
111
+ args=[
112
+ '--no-sandbox',
113
+ '--disable-dev-shm-usage',
114
+ '--disable-gpu',
115
+ '--disable-extensions',
116
+ '--disable-blink-features=AutomationControlled',
117
+ ]
118
+ )
119
+
120
+ # Create context with stealth settings
121
+ context = browser.new_context(
122
+ viewport={'width': 1920, 'height': 1080},
123
+ user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
124
+ locale='en-US',
125
+ timezone_id='America/New_York',
126
+ permissions=[],
127
+ extra_http_headers={
128
+ 'Accept-Language': 'en-US,en;q=0.9',
129
+ 'DNT': '1',
130
+ },
131
+ ignore_https_errors=True,
132
+ )
133
+
134
+ # Add init script to hide webdriver property
135
+ context.add_init_script("""
136
+ Object.defineProperty(navigator, 'webdriver', {
137
+ get: () => undefined
138
+ });
139
+
140
+ // Override the navigator.plugins to avoid detection
141
+ Object.defineProperty(navigator, 'plugins', {
142
+ get: () => [1, 2, 3, 4, 5]
143
+ });
144
+
145
+ // Override the navigator.languages to avoid detection
146
+ Object.defineProperty(navigator, 'languages', {
147
+ get: () => ['en-US', 'en']
148
+ });
149
+ """)
150
+
151
+ # Create a new page
152
+ page = context.new_page()
153
+
154
+ # Set default timeout
155
+ page.set_default_timeout(PAGE_LOAD_TIMEOUT * 1000) # Convert to milliseconds
156
+
157
+ logger.info("✓ Playwright browser created successfully")
158
+ return playwright, browser, context, page
159
+
160
+ except Exception as e:
161
+ logger.warning(f"Failed to create Playwright browser: {type(e).__name__}: {str(e)[:200]}")
162
+ logger.info("Playwright will be skipped. Install with: pip install playwright && playwright install")
163
+ return None, None, None, None
164
+
165
+
166
+ def fetch_page_with_playwright(url, page=None):
167
+ """
168
+ Fetch a webpage using Playwright to handle dynamic JavaScript content.
169
+
170
+ Args:
171
+ url (str): URL to fetch
172
+ page (playwright.sync_api.Page, optional): Existing page instance
173
+
174
+ Returns:
175
+ tuple: (BeautifulSoup object, (playwright, browser, context, page)) or (None, None) if failed
176
+ """
177
+ resources_created = False
178
+ playwright_instance = None
179
+ browser = None
180
+ context = None
181
+
182
+ try:
183
+ if page is None:
184
+ playwright_instance, browser, context, page = create_playwright_browser()
185
+ resources_created = True
186
+
187
+ if page is None:
188
+ return None, None
189
+
190
+ logger.info(f"Fetching URL with Playwright: {url}")
191
+
192
+ # Navigate to the URL
193
+ try:
194
+ response = page.goto(url, wait_until='networkidle', timeout=PAGE_LOAD_TIMEOUT * 1000)
195
+
196
+ # Check if navigation was successful
197
+ if response and response.status >= 400:
198
+ logger.warning(f"Playwright received HTTP {response.status}")
199
+ except PlaywrightTimeoutError:
200
+ logger.warning("Playwright navigation timeout, continuing anyway...")
201
+ except Exception as nav_error:
202
+ logger.warning(f"Playwright navigation error: {nav_error}")
203
+ # Continue anyway - page might have partially loaded
204
+
205
+ # Wait for document ready state
206
+ try:
207
+ page.wait_for_load_state('domcontentloaded', timeout=10000)
208
+ page.wait_for_load_state('load', timeout=10000)
209
+ except PlaywrightTimeoutError:
210
+ logger.debug("Load state timeout, continuing...")
211
+
212
+ # Additional wait for dynamic content to load
213
+ time.sleep(DYNAMIC_CONTENT_WAIT)
214
+
215
+ # Wait for body element to be present
216
+ try:
217
+ page.wait_for_selector('body', timeout=10000)
218
+ except PlaywrightTimeoutError:
219
+ logger.debug("Body selector timeout, continuing...")
220
+
221
+ # Get the fully rendered page source
222
+ page_source = page.content()
223
+
224
+ # Parse with BeautifulSoup
225
+ soup = BeautifulSoup(page_source, 'html.parser')
226
+
227
+ logger.info(f"✓ Successfully fetched and rendered page with Playwright")
228
+
229
+ # Return soup and resources (let caller handle cleanup)
230
+ if resources_created:
231
+ return soup, (playwright_instance, browser, context, page)
232
+ else:
233
+ return soup, None
234
+
235
+ except Exception as e:
236
+ logger.warning(f"Playwright fetch failed: {type(e).__name__}: {str(e)[:100]}")
237
+ if resources_created:
238
+ try:
239
+ if page:
240
+ page.close()
241
+ if context:
242
+ context.close()
243
+ if browser:
244
+ browser.close()
245
+ if playwright_instance:
246
+ playwright_instance.stop()
247
+ except:
248
+ pass
249
+ return None, None
250
+
251
+
252
+ def fetch_page_with_playwright_safe(url, page=None):
253
+ """
254
+ Thread-safe wrapper for fetch_page_with_playwright that works in both sync and async contexts.
255
+
256
+ This function detects if it's running inside an asyncio event loop (e.g., FastAPI/uvicorn)
257
+ and automatically runs the Playwright sync API in a separate thread to avoid conflicts.
258
+
259
+ Args:
260
+ url (str): URL to fetch
261
+ page (playwright.sync_api.Page, optional): Existing page instance
262
+
263
+ Returns:
264
+ tuple: (BeautifulSoup object, playwright_resources) or (None, None) if failed
265
+ """
266
+ if _is_running_in_event_loop():
267
+ # Running in async context (e.g., FastAPI) - use thread pool
268
+ logger.debug("Detected async context - running Playwright in separate thread")
269
+ try:
270
+ # Run the sync function in a thread pool executor
271
+ # This isolates Playwright's sync API from the asyncio event loop
272
+ import concurrent.futures
273
+ with concurrent.futures.ThreadPoolExecutor() as executor:
274
+ future = executor.submit(fetch_page_with_playwright, url, page)
275
+ result = future.result(timeout=PAGE_LOAD_TIMEOUT + 30) # Add buffer to timeout
276
+ return result
277
+ except Exception as e:
278
+ logger.warning(f"Failed to run Playwright in thread: {type(e).__name__}: {str(e)[:100]}")
279
+ return None, None
280
+ else:
281
+ # Running in sync context (e.g., direct script execution) - call directly
282
+ logger.debug("Detected sync context - running Playwright directly")
283
+ return fetch_page_with_playwright(url, page)
284
+
285
+
286
+ def get_modern_browser_headers(url=None):
287
+ """
288
+ Generate modern browser headers to mimic a real Chrome browser.
289
+
290
+ Args:
291
+ url (str, optional): The target URL for setting referer/origin
292
+
293
+ Returns:
294
+ dict: Dictionary of HTTP headers
295
+ """
296
+ headers = {
297
+ # Modern Chrome User-Agent (Chrome 120+)
298
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
299
+
300
+ # Accept headers
301
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
302
+ 'Accept-Language': 'en-US,en;q=0.9',
303
+ 'Accept-Encoding': 'gzip, deflate, br',
304
+
305
+ # Security headers (Sec-Fetch-* headers)
306
+ 'Sec-Fetch-Dest': 'document',
307
+ 'Sec-Fetch-Mode': 'navigate',
308
+ 'Sec-Fetch-Site': 'none',
309
+ 'Sec-Fetch-User': '?1',
310
+
311
+ # Additional browser headers
312
+ 'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
313
+ 'Sec-Ch-Ua-Mobile': '?0',
314
+ 'Sec-Ch-Ua-Platform': '"Windows"',
315
+
316
+ # Connection settings
317
+ 'Connection': 'keep-alive',
318
+ 'Upgrade-Insecure-Requests': '1',
319
+
320
+ # DNT (Do Not Track)
321
+ 'DNT': '1',
322
+
323
+ # Cache control
324
+ 'Cache-Control': 'max-age=0',
325
+ }
326
+
327
+ # Add referer if URL is provided
328
+ if url:
329
+ try:
330
+ parsed = urlparse(url)
331
+ if parsed.scheme and parsed.netloc:
332
+ origin = f"{parsed.scheme}://{parsed.netloc}"
333
+ headers['Origin'] = origin
334
+ headers['Referer'] = origin + '/'
335
+ except Exception:
336
+ pass
337
+
338
+ return headers
339
+
340
+
341
+ def create_session_with_retries(max_retries=3):
342
+ """
343
+ Create a requests session with retry logic and connection pooling.
344
+
345
+ Args:
346
+ max_retries (int): Maximum number of retries for failed requests
347
+
348
+ Returns:
349
+ requests.Session: Configured session object
350
+ """
351
+ session = requests.Session()
352
+
353
+ # Configure retry strategy
354
+ retry_strategy = Retry(
355
+ total=max_retries,
356
+ backoff_factor=1, # Wait 1s, 2s, 4s between retries
357
+ status_forcelist=[429, 500, 502, 503, 504], # Retry on these HTTP status codes
358
+ allowed_methods=["GET", "HEAD"], # Only retry safe methods
359
+ raise_on_status=False # Don't raise exception, let us handle it
360
+ )
361
+
362
+ # Mount adapter with retry strategy
363
+ adapter = HTTPAdapter(max_retries=retry_strategy, pool_connections=10, pool_maxsize=10)
364
+ session.mount("http://", adapter)
365
+ session.mount("https://", adapter)
366
+
367
+ return session
368
+
369
+
370
+ def preprocess_url(url):
371
+ """
372
+ Add http:// scheme to URL if missing.
373
+
374
+ Args:
375
+ url (str): Original URL
376
+
377
+ Returns:
378
+ str: URL with scheme
379
+ """
380
+ url = url.strip()
381
+ if not url.startswith(('http://', 'https://')):
382
+ return f'http://{url}'
383
+ return url
384
+
385
+
386
+ def extract_feature_with_retry(soup, feature_name, extraction_func, max_retries=FEATURE_EXTRACTION_MAX_RETRIES):
387
+ """
388
+ Extract a single feature with retry logic.
389
+
390
+ All features are returned as integers:
391
+ - 'has_*' features return binary 0 or 1
392
+ - 'number_of_*' and 'length_of_*' features return whole numbers (integers)
393
+ - On failure, returns -1 (instead of np.nan) to maintain integer type consistency
394
+
395
+ Args:
396
+ soup (BeautifulSoup): Parsed HTML content
397
+ feature_name (str): Name of the feature being extracted
398
+ extraction_func (callable): Function that performs the extraction
399
+ max_retries (int): Maximum number of retry attempts
400
+
401
+ Returns:
402
+ int: Feature value as integer, or -1 if all retries fail
403
+ """
404
+ last_exception = None
405
+
406
+ for attempt in range(max_retries):
407
+ try:
408
+ result = extraction_func(soup)
409
+
410
+ # If we got a valid result, cast to int and return it
411
+ if result is not None and not (isinstance(result, float) and np.isnan(result)):
412
+ if attempt > 0:
413
+ logger.debug(f"Feature '{feature_name}' extracted successfully on attempt {attempt + 1}")
414
+ # Ensure integer type for all features
415
+ return int(result)
416
+
417
+ # If result is None or np.nan, retry with a small delay
418
+ if attempt < max_retries - 1:
419
+ time.sleep(FEATURE_EXTRACTION_RETRY_DELAY * (attempt + 1))
420
+
421
+ except Exception as e:
422
+ last_exception = e
423
+ if attempt < max_retries - 1:
424
+ logger.debug(f"Retry {attempt + 1}/{max_retries} for '{feature_name}': {type(e).__name__}")
425
+ time.sleep(FEATURE_EXTRACTION_RETRY_DELAY * (attempt + 1))
426
+ continue
427
+
428
+ # All retries exhausted - return -1 to indicate failure while maintaining integer type
429
+ if last_exception:
430
+ logger.debug(f"Error extracting {feature_name} after {max_retries} attempts: {last_exception}")
431
+
432
+ return -1
433
+
434
+
435
+ def extract_features(url):
436
+ """
437
+ Extract all 43 features from a URL and its webpage.
438
+
439
+ Args:
440
+ url (str): URL to extract features from
441
+
442
+ Returns:
443
+ dict: Dictionary containing all 43 features as integers.
444
+ - 'has_*' features: 0 (not present), 1 (present), or -1 (extraction failed/unreachable)
445
+ - 'number_of_*' and 'length_of_*' features: >= 0 count/length, or -1 (extraction failed/unreachable)
446
+ """
447
+ # Initialize all features with -1 (for unreachable sites)
448
+ # Using -1 instead of None to maintain integer type consistency
449
+ features = {
450
+ 'has_title': -1,
451
+ 'has_input': -1,
452
+ 'has_button': -1,
453
+ 'has_image': -1,
454
+ 'has_submit': -1,
455
+ 'has_link': -1,
456
+ 'has_password': -1,
457
+ 'has_email_input': -1,
458
+ 'has_hidden_element': -1,
459
+ 'has_audio': -1,
460
+ 'has_video': -1,
461
+ 'number_of_inputs': -1,
462
+ 'number_of_buttons': -1,
463
+ 'number_of_images': -1,
464
+ 'number_of_option': -1,
465
+ 'number_of_list': -1,
466
+ 'number_of_th': -1,
467
+ 'number_of_tr': -1,
468
+ 'number_of_href': -1,
469
+ 'number_of_paragraph': -1,
470
+ 'number_of_script': -1,
471
+ 'length_of_title': -1,
472
+ 'has_h1': -1,
473
+ 'has_h2': -1,
474
+ 'has_h3': -1,
475
+ 'length_of_text': -1,
476
+ 'number_of_clickable_button': -1,
477
+ 'number_of_a': -1,
478
+ 'number_of_img': -1,
479
+ 'number_of_div': -1,
480
+ 'number_of_figure': -1,
481
+ 'has_footer': -1,
482
+ 'has_form': -1,
483
+ 'has_text_area': -1,
484
+ 'has_iframe': -1,
485
+ 'has_text_input': -1,
486
+ 'number_of_meta': -1,
487
+ 'has_nav': -1,
488
+ 'has_object': -1,
489
+ 'has_picture': -1,
490
+ 'number_of_sources': -1,
491
+ 'number_of_span': -1,
492
+ 'number_of_table': -1
493
+ }
494
+
495
+ # Preprocess URL
496
+ processed_url = preprocess_url(url)
497
+
498
+ # Try multiple approaches with increasing robustness
499
+ response = None
500
+ soup = None
501
+ last_error = None
502
+
503
+ # Approach 1: Use session with retry logic and modern headers
504
+ try:
505
+ logger.info(f"Attempting to fetch URL with session and retries: {processed_url}")
506
+ session = create_session_with_retries(max_retries=3)
507
+ headers = get_modern_browser_headers(processed_url)
508
+
509
+ response = session.get(
510
+ processed_url,
511
+ headers=headers,
512
+ timeout=15,
513
+ allow_redirects=True,
514
+ verify=False
515
+ )
516
+
517
+ # Check if we got a successful response
518
+ if response.status_code == 200:
519
+ logger.info(f"✓ Successfully fetched URL (status: {response.status_code})")
520
+ # Decode content with UTF-8 and replace errors to avoid encoding warnings
521
+ html_content = response.content.decode('utf-8', errors='replace')
522
+ soup = BeautifulSoup(html_content, 'html.parser')  # from_encoding dropped: html_content is already a decoded str, and passing it would itself trigger a bs4 warning
523
+ else:
524
+ logger.warning(f"Received HTTP {response.status_code} for {processed_url}")
525
+ raise requests.exceptions.HTTPError(f"HTTP {response.status_code}")
526
+
527
+ except requests.exceptions.Timeout as e:
528
+ last_error = f"Timeout error: Request took longer than 15 seconds"
529
+ logger.warning(f"✗ {last_error}")
530
+ except requests.exceptions.ConnectionError as e:
531
+ last_error = f"Connection error: Unable to establish connection to {processed_url}"
532
+ logger.warning(f"✗ {last_error}")
533
+ except requests.exceptions.HTTPError as e:
534
+ last_error = f"HTTP error: {str(e)}"
535
+ logger.warning(f"✗ {last_error}")
536
+ except requests.exceptions.TooManyRedirects as e:
537
+ last_error = f"Too many redirects: URL redirected too many times"
538
+ logger.warning(f"✗ {last_error}")
539
+ except Exception as e:
540
+ last_error = f"Unexpected error in approach 1: {type(e).__name__}: {str(e)[:100]}"
541
+ logger.warning(f"✗ {last_error}")
542
+
543
+ # Approach 2: Fallback to simple request with enhanced headers if first approach failed
544
+ if soup is None:
545
+ try:
546
+ logger.info(f"Trying fallback approach with enhanced headers...")
547
+ time.sleep(2) # Brief delay before retry
548
+
549
+ # More complete headers to mimic a real browser
550
+ headers = {
551
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
552
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
553
+ 'Accept-Language': 'en-US,en;q=0.9',
554
+ 'Accept-Encoding': 'gzip, deflate, br',
555
+ 'DNT': '1',
556
+ 'Connection': 'keep-alive',
557
+ 'Upgrade-Insecure-Requests': '1',
558
+ 'Sec-Fetch-Dest': 'document',
559
+ 'Sec-Fetch-Mode': 'navigate',
560
+ 'Sec-Fetch-Site': 'none',
561
+ 'Sec-Fetch-User': '?1',
562
+ 'Cache-Control': 'max-age=0',
563
+ }
564
+
565
+ response = requests.get(
566
+ processed_url,
567
+ headers=headers,
568
+ timeout=10,
569
+ allow_redirects=True,
570
+ verify=False
571
+ )
572
+
573
+ if response.status_code == 200:
574
+ logger.info(f"✓ Fallback approach succeeded (status: {response.status_code})")
575
+ # Decode content with UTF-8 and replace errors to avoid encoding warnings
576
+ html_content = response.content.decode('utf-8', errors='replace')
577
+ soup = BeautifulSoup(html_content, 'html.parser')  # from_encoding dropped: html_content is already a decoded str, and passing it would itself trigger a bs4 warning
578
+ else:
579
+ last_error = f"HTTP {response.status_code}: {response.reason}"
580
+ logger.warning(f"✗ Fallback failed with HTTP {response.status_code}")
581
+
582
+ except Exception as e:
583
+ last_error = f"Fallback error: {type(e).__name__}: {str(e)[:100]}"
584
+ logger.warning(f"✗ {last_error}")
585
+
586
+ # Approach 3: Use Playwright for dynamic content if previous approaches failed
587
+ playwright_resources = None
588
+ if soup is None:
589
+ try:
590
+ logger.info(f"Trying Playwright approach for dynamic content...")
591
+ time.sleep(1) # Brief delay before retry
592
+
593
+ soup, playwright_resources = fetch_page_with_playwright_safe(processed_url)
594
+
595
+ if soup is not None:
596
+ logger.info(f"✓ Playwright approach succeeded")
597
+ else:
598
+ last_error = "Playwright fetch failed"
599
+ logger.warning(f"✗ Playwright approach failed")
600
+
601
+ except Exception as e:
602
+ last_error = f"Playwright error: {type(e).__name__}: {str(e)[:100]}"
603
+ logger.warning(f"✗ {last_error}")
604
+
605
+ # If all approaches failed, return features with their -1 sentinel values
606
+ if soup is None:
607
+ error_msg = last_error if last_error else "Unknown error occurred"
608
+ logger.error(f" ✗ Failed to extract features from {processed_url}: {error_msg}")
609
+ print(f" ✗ Failed to extract features: {error_msg}")
610
+ return features
611
+
612
+ # Successfully fetched content, now extract features
613
+ # Elements map to 1 (present) / 0 (absent); parsing failures fall back to -1 via extract_feature_with_retry
614
+ # Each feature extraction includes retry logic for robustness
615
+
616
+ # 1. has_title
617
+ features['has_title'] = extract_feature_with_retry(
618
+ soup, 'has_title',
619
+ lambda s: 1 if s.find('title') else 0
620
+ )
621
+
622
+ # 2. has_input
623
+ features['has_input'] = extract_feature_with_retry(
624
+ soup, 'has_input',
625
+ lambda s: 1 if s.find('input') else 0
626
+ )
627
+
628
+ # 3. has_button
629
+ features['has_button'] = extract_feature_with_retry(
630
+ soup, 'has_button',
631
+ lambda s: 1 if s.find('button') else 0
632
+ )
633
+
634
+ # 4. has_image
635
+ features['has_image'] = extract_feature_with_retry(
636
+ soup, 'has_image',
637
+ lambda s: 1 if s.find('img') else 0
638
+ )
639
+
640
+ # 5. has_submit
641
+ features['has_submit'] = extract_feature_with_retry(
642
+ soup, 'has_submit',
643
+ lambda s: 1 if s.find('input', {'type': 'submit'}) else 0
644
+ )
645
+
646
+ # 6. has_link
647
+ features['has_link'] = extract_feature_with_retry(
648
+ soup, 'has_link',
649
+ lambda s: 1 if s.find('a') else 0
650
+ )
651
+
652
+ # 7. has_password
653
+ features['has_password'] = extract_feature_with_retry(
654
+ soup, 'has_password',
655
+ lambda s: 1 if s.find('input', {'type': 'password'}) else 0
656
+ )
657
+
658
+ # 8. has_email_input
659
+ features['has_email_input'] = extract_feature_with_retry(
660
+ soup, 'has_email_input',
661
+ lambda s: 1 if s.find('input', {'type': 'email'}) else 0
662
+ )
663
+
664
+ # 9. has_hidden_element
665
+ features['has_hidden_element'] = extract_feature_with_retry(
666
+ soup, 'has_hidden_element',
667
+ lambda s: 1 if s.find('input', {'type': 'hidden'}) else 0
668
+ )
669
+
670
+ # 10. has_audio
671
+ features['has_audio'] = extract_feature_with_retry(
672
+ soup, 'has_audio',
673
+ lambda s: 1 if s.find('audio') else 0
674
+ )
675
+
676
+ # 11. has_video
677
+ features['has_video'] = extract_feature_with_retry(
678
+ soup, 'has_video',
679
+ lambda s: 1 if s.find('video') else 0
680
+ )
681
+
682
+ # 12. number_of_inputs
683
+ features['number_of_inputs'] = extract_feature_with_retry(
684
+ soup, 'number_of_inputs',
685
+ lambda s: len(s.find_all('input'))
686
+ )
687
+
688
+ # 13. number_of_buttons
689
+ features['number_of_buttons'] = extract_feature_with_retry(
690
+ soup, 'number_of_buttons',
691
+ lambda s: len(s.find_all('button'))
692
+ )
693
+
694
+ # 14. number_of_images
695
+ features['number_of_images'] = extract_feature_with_retry(
696
+ soup, 'number_of_images',
697
+ lambda s: len(s.find_all('img'))
698
+ )
699
+
700
+ # 15. number_of_option
701
+ features['number_of_option'] = extract_feature_with_retry(
702
+ soup, 'number_of_option',
703
+ lambda s: len(s.find_all('option'))
704
+ )
705
+
706
+ # 16. number_of_list
707
+ features['number_of_list'] = extract_feature_with_retry(
708
+ soup, 'number_of_list',
709
+ lambda s: len(s.find_all('li'))
710
+ )
711
+
712
+ # 17. number_of_th
713
+ features['number_of_th'] = extract_feature_with_retry(
714
+ soup, 'number_of_th',
715
+ lambda s: len(s.find_all('th'))
716
+ )
717
+
718
+ # 18. number_of_tr
719
+ features['number_of_tr'] = extract_feature_with_retry(
720
+ soup, 'number_of_tr',
721
+ lambda s: len(s.find_all('tr'))
722
+ )
723
+
724
+ # 19. number_of_href
725
+ features['number_of_href'] = extract_feature_with_retry(
726
+ soup, 'number_of_href',
727
+ lambda s: len(s.find_all('a', href=True))
728
+ )
729
+
730
+ # 20. number_of_paragraph
731
+ features['number_of_paragraph'] = extract_feature_with_retry(
732
+ soup, 'number_of_paragraph',
733
+ lambda s: len(s.find_all('p'))
734
+ )
735
+
736
+ # 21. number_of_script
737
+ features['number_of_script'] = extract_feature_with_retry(
738
+ soup, 'number_of_script',
739
+ lambda s: len(s.find_all('script'))
740
+ )
741
+
742
+ # 22. length_of_title
743
+ def extract_title_length(s):
744
+ title_tag = s.find('title')
745
+ return len(title_tag.get_text()) if title_tag else 0
746
+
747
+ features['length_of_title'] = extract_feature_with_retry(
748
+ soup, 'length_of_title',
749
+ extract_title_length
750
+ )
751
+
752
+ # 23. has_h1
753
+ features['has_h1'] = extract_feature_with_retry(
754
+ soup, 'has_h1',
755
+ lambda s: 1 if s.find('h1') else 0
756
+ )
757
+
758
+ # 24. has_h2
759
+ features['has_h2'] = extract_feature_with_retry(
760
+ soup, 'has_h2',
761
+ lambda s: 1 if s.find('h2') else 0
762
+ )
763
+
764
+ # 25. has_h3
765
+ features['has_h3'] = extract_feature_with_retry(
766
+ soup, 'has_h3',
767
+ lambda s: 1 if s.find('h3') else 0
768
+ )
769
+
770
+ # 26. length_of_text
771
+ def extract_text_length(s):
772
+ # Create a copy to avoid modifying the original soup
773
+ soup_copy = BeautifulSoup(str(s), 'html.parser')
774
+ for script_or_style in soup_copy(['script', 'style']):
775
+ script_or_style.decompose()
776
+ body = soup_copy.find('body')
777
+ if body:
778
+ text = body.get_text()
779
+ return len(text)
780
+ return 0
781
+
782
+ features['length_of_text'] = extract_feature_with_retry(
783
+ soup, 'length_of_text',
784
+ extract_text_length
785
+ )
786
+
787
+ # 27. number_of_clickable_button
788
+ def extract_clickable_buttons(s):
789
+ buttons = len(s.find_all('button'))
790
+ input_buttons = len(s.find_all('input', {'type': ['button', 'submit', 'reset']}))
791
+ return buttons + input_buttons
792
+
793
+ features['number_of_clickable_button'] = extract_feature_with_retry(
794
+ soup, 'number_of_clickable_button',
795
+ extract_clickable_buttons
796
+ )
797
+
798
+ # 28. number_of_a
799
+ features['number_of_a'] = extract_feature_with_retry(
800
+ soup, 'number_of_a',
801
+ lambda s: len(s.find_all('a'))
802
+ )
803
+
804
+ # 29. number_of_img
805
+ features['number_of_img'] = extract_feature_with_retry(
806
+ soup, 'number_of_img',
807
+ lambda s: len(s.find_all('img'))
808
+ )
809
+
810
+ # 30. number_of_div
811
+ features['number_of_div'] = extract_feature_with_retry(
812
+ soup, 'number_of_div',
813
+ lambda s: len(s.find_all('div'))
814
+ )
815
+
816
+ # 31. number_of_figure
817
+ features['number_of_figure'] = extract_feature_with_retry(
818
+ soup, 'number_of_figure',
819
+ lambda s: len(s.find_all('figure'))
820
+ )
821
+
822
+ # 32. has_footer
823
+ features['has_footer'] = extract_feature_with_retry(
824
+ soup, 'has_footer',
825
+ lambda s: 1 if s.find('footer') else 0
826
+ )
827
+
828
+ # 33. has_form
829
+ features['has_form'] = extract_feature_with_retry(
830
+ soup, 'has_form',
831
+ lambda s: 1 if s.find('form') else 0
832
+ )
833
+
834
+ # 34. has_text_area
835
+ features['has_text_area'] = extract_feature_with_retry(
836
+ soup, 'has_text_area',
837
+ lambda s: 1 if s.find('textarea') else 0
838
+ )
839
+
840
+ # 35. has_iframe
841
+ features['has_iframe'] = extract_feature_with_retry(
842
+ soup, 'has_iframe',
843
+ lambda s: 1 if s.find('iframe') else 0
844
+ )
845
+
846
+ # 36. has_text_input
847
+ features['has_text_input'] = extract_feature_with_retry(
848
+ soup, 'has_text_input',
849
+ lambda s: 1 if s.find('input', {'type': 'text'}) else 0
850
+ )
851
+
852
+ # 37. number_of_meta
853
+ features['number_of_meta'] = extract_feature_with_retry(
854
+ soup, 'number_of_meta',
855
+ lambda s: len(s.find_all('meta'))
856
+ )
857
+
858
+ # 38. has_nav
859
+ features['has_nav'] = extract_feature_with_retry(
860
+ soup, 'has_nav',
861
+ lambda s: 1 if s.find('nav') else 0
862
+ )
863
+
864
+ # 39. has_object
865
+ features['has_object'] = extract_feature_with_retry(
866
+ soup, 'has_object',
867
+ lambda s: 1 if s.find('object') else 0
868
+ )
869
+
870
+ # 40. has_picture
871
+ features['has_picture'] = extract_feature_with_retry(
872
+ soup, 'has_picture',
873
+ lambda s: 1 if s.find('picture') else 0
874
+ )
875
+
876
+ # 41. number_of_sources
877
+ features['number_of_sources'] = extract_feature_with_retry(
878
+ soup, 'number_of_sources',
879
+ lambda s: len(s.find_all('source'))
880
+ )
881
+
882
+ # 42. number_of_span
883
+ features['number_of_span'] = extract_feature_with_retry(
884
+ soup, 'number_of_span',
885
+ lambda s: len(s.find_all('span'))
886
+ )
887
+
888
+ # 43. number_of_table
889
+ features['number_of_table'] = extract_feature_with_retry(
890
+ soup, 'number_of_table',
891
+ lambda s: len(s.find_all('table'))
892
+ )
893
+
894
+ # Clean up Playwright resources if they were created
895
+ if playwright_resources is not None:
896
+ try:
897
+ playwright_instance, browser, context, page = playwright_resources
898
+ if page:
899
+ page.close()
900
+ if context:
901
+ context.close()
902
+ if browser:
903
+ browser.close()
904
+ if playwright_instance:
905
+ playwright_instance.stop()
906
+ logger.debug("Playwright resources closed successfully")
907
+ except Exception as e:
908
+ logger.debug(f"Error closing Playwright resources: {e}")
909
+
910
+ # Count successfully extracted features
911
+ # Features with value >= 0 are successfully extracted, -1 indicates failure
912
+ successful_features = sum(1 for v in features.values() if isinstance(v, int) and v >= 0)
913
+ failed_features = sum(1 for v in features.values() if v == -1)
914
+
915
+ if failed_features > 0:
916
+ logger.warning(f"⚠ Extracted {successful_features}/43 features from {processed_url} ({failed_features} failed)")
917
+ else:
918
+ logger.info(f"✓ Successfully extracted all 43 features from {processed_url}")
919
+
920
+ return features
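For orientation, a minimal sketch of how a caller might consume this function; turning the dict into a single-row pandas frame is an assumption about downstream use, since the model interface lives elsewhere in the repo:

    # Sketch only — assumes pandas (pinned in requirements.txt below)
    import pandas as pd
    feats = extract_features("example.com")                 # dict of 43 integer features
    row = pd.DataFrame([feats])                             # one row, one column per feature
    unreachable = all(v == -1 for v in feats.values())      # all -1 means the page could not be fetched
    print(row.shape, "unreachable" if unreachable else "fetched")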
requirements.txt ADDED
@@ -0,0 +1,22 @@
1
+ # FastAPI and server
2
+ fastapi==0.115.0
3
+ uvicorn[standard]==0.32.0
4
+ pydantic==2.9.2
5
+ huggingface_hub==0.35.3
6
+
7
+ # Data processing
8
+ pandas==2.2.2
9
+ numpy==2.0.2
10
+
11
+ # Machine learning
12
+ scikit-learn==1.6.1
13
+ lightgbm==4.6.0
14
+ xgboost==3.1.2
15
+ catboost==1.2.8
16
+ joblib==1.5.2
17
+
18
+ # Feature extraction dependencies
19
+ requests==2.32.3
20
+ beautifulsoup4==4.12.3
21
+ urllib3==2.2.3
22
+ playwright==1.48.0
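One hedged operational note: the playwright package does not ship browser binaries, so the Playwright fallback used during feature extraction only works if a browser has been installed in the environment (presumably handled at image-build time). Locally that is typically something like the command below; chromium is an assumption about which browser the fetch helper launches:

    python -m playwright install chromium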
run.py ADDED
@@ -0,0 +1,40 @@
1
+ """
2
+ Simple script to run the FastAPI application.
3
+ """
4
+
5
+ import uvicorn
6
+ import sys
7
+ import os
8
+
9
+ def main():
10
+ """Run the FastAPI application."""
11
+ # Get port from environment variable or use default
12
+ port = int(os.getenv("PORT", "8000"))
13
+
14
+ # Get host from environment variable or use default
15
+ host = os.getenv("HOST", "0.0.0.0")
16
+
17
+ # Check if reload flag is passed
18
+ reload = "--reload" in sys.argv or "-r" in sys.argv
19
+
20
+ print("="*60)
21
+ print("🔒 Phishing URL Detection API")
22
+ print("="*60)
23
+ print(f"Starting server on {host}:{port}")
24
+ print(f"Reload mode: {'Enabled' if reload else 'Disabled'}")
25
+ print(f"\nAPI Documentation:")
26
+ print(f" - Swagger UI: http://{host if host != '0.0.0.0' else 'localhost'}:{port}/docs")
27
+ print(f" - ReDoc: http://{host if host != '0.0.0.0' else 'localhost'}:{port}/redoc")
28
+ print("="*60)
29
+
30
+ uvicorn.run(
31
+ "main:app",
32
+ host=host,
33
+ port=port,
34
+ reload=reload,
35
+ log_level="info"
36
+ )
37
+
38
+ if __name__ == "__main__":
39
+ main()
40
+
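Typical invocations of this launcher (illustrative, based on the PORT/HOST environment variables and the reload flag it reads):

    python run.py                                       # serves on 0.0.0.0:8000 by default
    PORT=7860 HOST=127.0.0.1 python run.py --reload     # custom bind address with auto-reload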