File size: 6,774 Bytes
7a3576b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
"""
Model prediction helper module for phishing URL detection.
Handles model loading, feature extraction, and prediction.
"""

import os
import sys
import numpy as np
import pandas as pd
import joblib
import warnings
warnings.filterwarnings("ignore", message="X does not have valid feature names", category=UserWarning)

# Add parent directory to path to import url_feature_extraction module
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
from url_feature_extraction.url_feature_extractor import extract_features

# Global variable to cache the loaded model
_model_cache = None


def load_model(model_path="model/url_stacking_model.joblib"):
    """
    Load and cache the saved stacking model from disk.

    The loaded components are cached at module level so repeated calls do not
    re-read the file. The cache also remembers which path it was loaded from,
    so a call with a *different* model_path reloads from disk instead of
    returning components for the wrong file.

    Args:
        model_path (str): Path to the model file relative to the FastAPI app
            directory (the parent of this file's directory).

    Returns:
        dict: Dictionary containing model components:
            - base_models: Dictionary of base models
            - meta_scaler: Scaler for meta features
            - meta_model: Meta model for final prediction
            - feature_names: List of feature names
            - model_names: List of model names

    Raises:
        FileNotFoundError: If the resolved model file does not exist.
        KeyError: If the saved payload is missing any expected component.
    """
    global _model_cache

    # Serve from cache only when the same path was loaded before.
    # (Path check comes first so it also guards the _model_cache read.)
    if getattr(load_model, "_cached_path", None) == model_path and _model_cache is not None:
        return _model_cache

    # Resolve model_path relative to this file's parent (the app directory).
    current_dir = os.path.dirname(os.path.abspath(__file__))
    full_model_path = os.path.normpath(os.path.join(current_dir, "..", model_path))

    if not os.path.exists(full_model_path):
        raise FileNotFoundError(f"Model file not found at: {full_model_path}")

    # Load the serialized component bundle.
    model_data = joblib.load(full_model_path)
    print(f"✅ Model loaded successfully from: {full_model_path}")

    _model_cache = {
        "base_models": model_data["base_models"],
        "meta_scaler": model_data["meta_scaler"],
        "meta_model": model_data["meta_model"],
        "feature_names": model_data["feature_names"],
        "model_names": model_data["model_names"],
    }
    # Record which path this cache corresponds to (see cache check above).
    load_model._cached_path = model_path

    return _model_cache


def predict_url(url: str, model_components: dict = None):
    """
    Classify a raw URL as phishing or legitimate.

    Pipeline:
    1. Extract features from the URL via url_feature_extractor.
    2. Feed the feature dictionary to the stacking model.
    3. Wrap the result (or any failure) in a uniform response dictionary.

    Args:
        url (str): Raw URL to predict.
        model_components (dict, optional): Pre-loaded model components;
            loaded on demand when None.

    Returns:
        dict: Response with keys:
            - url: The input URL
            - predicted_label: 0 (legitimate) or 1 (phishing), or None on failure
            - prediction: "legitimate" / "phishing" / "unknown" / "error"
            - phish_probability: Phishing probability in [0.0, 1.0], or None
            - confidence: Confidence percentage, or None
            - features_extracted: Whether feature extraction succeeded
            - error: Present only when something went wrong
    """
    components = model_components if model_components is not None else load_model()

    features = extract_features(url)

    # A missing has_title value is the extractor's signal that the page
    # could not be fetched or parsed at all.
    if features.get('has_title') is None:
        return {
            "url": url,
            "predicted_label": None,
            "prediction": "unknown",
            "phish_probability": None,
            "confidence": None,
            "features_extracted": False,
            "error": "Failed to extract features from URL. The URL may be unreachable or invalid."
        }

    try:
        result = predict_from_features(features, components)
        label = result["predicted_label"]
        prob = result["phish_probability"]

        # Confidence is the probability of whichever class was chosen.
        confidence = max(prob, 1 - prob) * 100

        return {
            "url": url,
            "predicted_label": label,
            "prediction": "phishing" if label == 1 else "legitimate",
            "phish_probability": round(prob, 4),
            "confidence": round(confidence, 2),
            "features_extracted": True
        }
    except Exception as exc:
        # Any model-side failure is reported in-band rather than raised,
        # so API callers always receive a well-formed response.
        return {
            "url": url,
            "predicted_label": None,
            "prediction": "error",
            "phish_probability": None,
            "confidence": None,
            "features_extracted": True,
            "error": f"Prediction error: {str(exc)}"
        }


def predict_from_features(features_dict: dict, model_components: dict):
    """
    Run the two-level stacking model on one row of extracted features.

    Level 0: each base model emits a phishing probability for the row.
    Level 1: the meta-model combines the scaled base probabilities.

    Args:
        features_dict (dict): Mapping of feature name -> value for one URL.
        model_components (dict): The loaded components returned by load_model().

    Returns:
        dict: Contains 'predicted_label' (0 or 1) and 'phish_probability' (float)

    Raises:
        ValueError: If any feature required by the model is missing.
        KeyError: If model_names lists a model absent from base_models.
    """
    base_models = model_components["base_models"]
    meta_scaler = model_components["meta_scaler"]
    meta_model = model_components["meta_model"]
    feature_names = model_components["feature_names"]
    model_names = model_components["model_names"]

    # Single-row DataFrame keeps the 2-D shape sklearn estimators expect.
    X = pd.DataFrame([features_dict])

    # Ensure all required columns exist before slicing.
    missing_cols = set(feature_names) - set(X.columns)
    if missing_cols:
        raise ValueError(f"❌ Missing required features: {missing_cols}")

    # Drop unknown features and enforce training-time column order.
    X = X[feature_names]

    # ------------------------------
    # Level 0: Base model predictions
    # ------------------------------
    # Iterate model_names (not base_models.items()) so column idx of the
    # meta-feature matrix is guaranteed to hold the model named in the
    # "<name>_pred" label below, even if the dict's insertion order ever
    # differs from model_names.
    meta_features = np.zeros((X.shape[0], len(model_names)))
    for idx, model_name in enumerate(model_names):
        meta_features[:, idx] = base_models[model_name].predict_proba(X)[:, 1]

    meta_features_df = pd.DataFrame(
        meta_features, columns=[f"{n}_pred" for n in model_names]
    )

    # ------------------------------
    # Level 1: Meta-model prediction
    # ------------------------------
    meta_scaled = meta_scaler.transform(meta_features_df)
    # Re-wrap as a DataFrame so the meta-model sees named columns,
    # matching how it was fitted.
    meta_scaled = pd.DataFrame(meta_scaled, columns=meta_features_df.columns)

    final_pred = meta_model.predict(meta_scaled)[0]
    final_prob = meta_model.predict_proba(meta_scaled)[:, 1][0]

    return {
        "predicted_label": int(final_pred),
        "phish_probability": float(final_prob)
    }