Rasel Santillan committed on
Commit
7a3576b
·
1 Parent(s): f3f638f

Add application file

Browse files
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9

# Run as a non-root user; uid 1000 is the Hugging Face Spaces convention
RUN useradd -m -u 1000 user
USER user
# Make user-scheme pip installs (placed under ~/.local/bin) resolvable
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Install dependencies first so this layer is cached when only app code changes
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy application source; port 7860 is the port HF Spaces exposes by default
COPY --chown=user . /app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
__pycache__/app.cpython-312.pyc ADDED
Binary file (5.89 kB). View file
 
app.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI application for phishing URL detection.
3
+ Provides a REST API endpoint to predict if a URL is phishing or legitimate.
4
+ """
5
+
6
+ from fastapi import FastAPI, HTTPException
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from pydantic import BaseModel, Field, validator
9
+ from typing import Optional
10
+ import uvicorn
11
+
12
+ from model.model import load_model, predict_url
13
+
14
# Initialize FastAPI app
app = FastAPI(
    title="Phishing URL Detection API",
    description="API for detecting phishing URLs using machine learning",
    version="1.0.0"
)

# Add CORS middleware to allow web access
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, replace with specific origins
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Loaded model components; stays None until the startup handler succeeds.
# Endpoints check this for None to report "model not loaded".
model_components = None
@app.on_event("startup")
async def startup_event():
    """Populate the module-level model cache once, when the app boots.

    Re-raises any loading failure so the server refuses to start with no model.
    NOTE(review): `on_event` is deprecated in newer FastAPI in favor of
    lifespan handlers — migrating requires changing the FastAPI(...) call.
    """
    global model_components
    try:
        model_components = load_model()
    except Exception as exc:
        print(f"❌ Failed to load model on startup: {exc}")
        raise
    print("✅ Model loaded successfully on startup")
# Request and Response Models
class URLRequest(BaseModel):
    """Request model for URL prediction."""
    # Raw URL submitted by the client; must be non-empty.
    url: str = Field(..., description="The URL to check for phishing", min_length=1)

    @validator('url')
    def validate_url(cls, v):
        """Validate that URL is not empty after stripping whitespace."""
        # min_length=1 alone would accept all-whitespace input, so strip first
        # and store the trimmed value.
        if not v.strip():
            raise ValueError('URL cannot be empty')
        return v.strip()

    class Config:
        # Example payload shown in the interactive OpenAPI docs.
        # NOTE(review): `validator`/`schema_extra` are pydantic v1 APIs;
        # pydantic v2 renames them `field_validator`/`json_schema_extra`.
        schema_extra = {
            "example": {
                "url": "https://www.google.com"
            }
        }
class PredictionResponse(BaseModel):
    """Response model for URL prediction."""
    # Optional fields remain None when feature extraction or prediction fails.
    url: str = Field(..., description="The URL that was analyzed")
    predicted_label: Optional[int] = Field(None, description="0 for legitimate, 1 for phishing, None if error")
    prediction: str = Field(..., description="Human-readable prediction: 'legitimate', 'phishing', 'unknown', or 'error'")
    phish_probability: Optional[float] = Field(None, description="Probability of being phishing (0.0 to 1.0)")
    confidence: Optional[float] = Field(None, description="Confidence percentage of the prediction")
    features_extracted: bool = Field(..., description="Whether features were successfully extracted from the URL")
    error: Optional[str] = Field(None, description="Error message if prediction failed")

    class Config:
        # Example payload shown in the interactive OpenAPI docs.
        # NOTE(review): `schema_extra` is the pydantic v1 key; v2 uses `json_schema_extra`.
        schema_extra = {
            "example": {
                "url": "https://www.google.com",
                "predicted_label": 0,
                "prediction": "legitimate",
                "phish_probability": 0.0234,
                "confidence": 97.66,
                "features_extracted": True,
                "error": None
            }
        }
# API Endpoints
@app.get("/")
async def root():
    """Root endpoint: returns API name, version, and an endpoint directory."""
    endpoint_map = {
        "/predict": "POST - Predict if a URL is phishing or legitimate",
        "/health": "GET - Check API health status",
        "/docs": "GET - Interactive API documentation",
    }
    return {
        "message": "Phishing URL Detection API",
        "version": "1.0.0",
        "endpoints": endpoint_map,
    }
+
104
@app.get("/health")
async def health_check():
    """Health check endpoint: reports liveness and whether the model loaded."""
    loaded = model_components is not None
    return {"status": "healthy", "model_loaded": loaded}
@app.post("/predict", response_model=PredictionResponse)
async def predict(request: URLRequest):
    """
    Predict if a URL is phishing or legitimate.

    Args:
        request: URLRequest containing the URL to analyze

    Returns:
        PredictionResponse with prediction results

    Raises:
        HTTPException: 503 when the model has not been loaded yet,
            500 when prediction (or response construction) fails
    """
    # Fail fast while the startup handler has not populated the model yet.
    if model_components is None:
        raise HTTPException(
            status_code=503,
            detail="Model not loaded. Please try again later."
        )

    # Response construction stays inside the try so a malformed result
    # dict also surfaces as a 500 with a descriptive detail message.
    try:
        outcome = predict_url(request.url, model_components)
        return PredictionResponse(**outcome)
    except Exception as exc:
        raise HTTPException(
            status_code=500,
            detail=f"Prediction failed: {str(exc)}"
        )
# Run the application (development entry point).
# In the Docker image, uvicorn is launched via CMD instead, without reload.
if __name__ == "__main__":
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=7860,
        reload=True  # auto-reload on code change; development use only
    )
model/__pycache__/model.cpython-312.pyc ADDED
Binary file (6.89 kB). View file
 
model/model.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Model prediction helper module for phishing URL detection.
Handles model loading, feature extraction, and prediction.
"""

import os
import sys
import numpy as np
import pandas as pd
import joblib
import warnings
# scikit-learn emits this warning when estimators trained on named columns
# receive plain arrays; it is expected here and safe to silence.
warnings.filterwarnings("ignore", message="X does not have valid feature names", category=UserWarning)

# Add parent directory to path to import url_feature_extraction module
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
from url_feature_extraction.url_feature_extractor import extract_features

# Global variable to cache the loaded model (populated lazily by load_model)
_model_cache = None
def load_model(model_path="model/url_stacking_model.joblib"):
    """
    Load the saved stacking model from file, caching it per resolved path.

    Args:
        model_path (str): Path to the model file relative to the FastAPI app directory

    Returns:
        dict: Dictionary containing model components:
            - base_models: Dictionary of base models
            - meta_scaler: Scaler for meta features
            - meta_model: Meta model for final prediction
            - feature_names: List of feature names
            - model_names: List of model names

    Raises:
        FileNotFoundError: If no file exists at the resolved model path.
        KeyError: If the saved artifact lacks one of the expected components.
    """
    global _model_cache

    # Resolve relative to this file's parent directory so the model is found
    # regardless of the process's current working directory.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    full_model_path = os.path.normpath(os.path.join(current_dir, "..", model_path))

    # Cache per resolved path. (The previous implementation cached a single
    # model and silently ignored `model_path` on every call after the first.)
    if _model_cache is None:
        _model_cache = {}
    if full_model_path in _model_cache:
        return _model_cache[full_model_path]

    if not os.path.exists(full_model_path):
        raise FileNotFoundError(f"Model file not found at: {full_model_path}")

    # Load model (joblib deserializes the scikit-learn estimators inside)
    model_data = joblib.load(full_model_path)
    print(f"✅ Model loaded successfully from: {full_model_path}")

    components = {
        "base_models": model_data["base_models"],
        "meta_scaler": model_data["meta_scaler"],
        "meta_model": model_data["meta_model"],
        "feature_names": model_data["feature_names"],
        "model_names": model_data["model_names"]
    }
    _model_cache[full_model_path] = components
    return components
def predict_url(url: str, model_components: dict = None):
    """
    Make prediction for a given URL.

    Pipeline:
      1. Extract features from the raw URL via url_feature_extractor.
      2. Run the stacking model on those features.
      3. Package the outcome into a JSON-serializable dict.

    Args:
        url (str): Raw URL to predict
        model_components (dict, optional): Pre-loaded model components.
            If None, will load the model.

    Returns:
        dict with keys: url, predicted_label (0/1 or None), prediction
        ("legitimate"/"phishing"/"unknown"/"error"), phish_probability,
        confidence, features_extracted, and (on failure) error.
    """
    if model_components is None:
        model_components = load_model()

    features_dict = extract_features(url)

    # A None `has_title` signals the extractor could not fetch the page,
    # i.e. feature extraction as a whole failed.
    if features_dict.get('has_title') is None:
        return {
            "url": url,
            "predicted_label": None,
            "prediction": "unknown",
            "phish_probability": None,
            "confidence": None,
            "features_extracted": False,
            "error": "Failed to extract features from URL. The URL may be unreachable or invalid."
        }

    try:
        outcome = predict_from_features(features_dict, model_components)
    except Exception as exc:
        return {
            "url": url,
            "predicted_label": None,
            "prediction": "error",
            "phish_probability": None,
            "confidence": None,
            "features_extracted": True,
            "error": f"Prediction error: {str(exc)}"
        }

    label = outcome["predicted_label"]
    phish_prob = outcome["phish_probability"]
    # Confidence = probability of whichever class was predicted, as a percent.
    confidence = 100 * max(phish_prob, 1 - phish_prob)

    return {
        "url": url,
        "predicted_label": label,
        "prediction": "phishing" if label == 1 else "legitimate",
        "phish_probability": round(phish_prob, 4),
        "confidence": round(confidence, 2),
        "features_extracted": True
    }
def predict_from_features(features_dict: dict, model_components: dict):
    """
    Make predictions given a dictionary of extracted features.

    Implements two-level stacking:
      - Level 0: each base model emits its phishing probability.
      - Level 1: the meta model combines those (scaled) probabilities.

    Args:
        features_dict (dict): Mapping of feature name -> feature value
        model_components (dict): Components returned by load_model()

    Returns:
        dict: Contains 'predicted_label' (0 or 1) and 'phish_probability' (float)

    Raises:
        ValueError: If any feature expected by the model is absent.
    """
    feature_names = model_components["feature_names"]
    base_models = model_components["base_models"]

    # One-row DataFrame keeps the 2-D shape the estimators expect.
    frame = pd.DataFrame([features_dict])

    absent = set(feature_names) - set(frame.columns)
    if absent:
        raise ValueError(f"❌ Missing required features: {absent}")

    # Drop unknown columns and enforce the training-time column order.
    frame = frame[feature_names]

    # Level 0: each base model contributes its P(phishing) as a meta-feature.
    level0 = np.zeros((frame.shape[0], len(base_models)))
    for col, estimator in enumerate(base_models.values()):
        level0[:, col] = estimator.predict_proba(frame)[:, 1]

    # NOTE(review): columns are named from `model_names` while values are
    # filled in `base_models` iteration order — assumes both agree; verify.
    level0_df = pd.DataFrame(
        level0, columns=[f"{name}_pred" for name in model_components["model_names"]]
    )

    # Level 1: scale the meta-features, then let the meta model decide.
    scaled = model_components["meta_scaler"].transform(level0_df)
    scaled_df = pd.DataFrame(scaled, columns=level0_df.columns)

    meta_model = model_components["meta_model"]
    label = meta_model.predict(scaled_df)[0]
    probability = meta_model.predict_proba(scaled_df)[:, 1][0]

    return {
        "predicted_label": int(label),
        "phish_probability": float(probability)
    }
model/url_stacking_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc4e81eb5ce124016facc45fbe74d8b71f250c7676003b00d17f67bb730b5840
3
+ size 279828900
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FastAPI and web server
2
+ fastapi
3
+ uvicorn[standard]
4
+
5
+ # Data processing and ML
6
+ pandas==2.2.2
7
+ numpy==2.0.2
8
+ scikit-learn==1.6.1
9
+ lightgbm==4.6.0
10
+ xgboost==3.0.5
11
+ joblib==1.5.2
12
+
13
+ # Feature extraction dependencies
14
+ requests
15
+ beautifulsoup4
16
+ urllib3