Spaces:

xxemrzru
/

url-phish-fastapi

Runtime error

App Files Community

Rasel Santillan committed on Jan 7

Commit

b5d7eef

1 Parent(s): 8a9ac80

Describe your changes

Browse files

Files changed (2) hide show

categorization.py +24 -17
model/model.py +77 -17

categorization.py CHANGED Viewed

@@ -14,7 +14,7 @@ class RiskCategory(str, Enum):
     SAFE = "Safe"
     LOW = "Low"
     MODERATE = "Moderate"
-    HIGH = "Dangerous"
     CRITICAL = "Critical"
@@ -26,34 +26,41 @@ class BinaryClassification(str, Enum):
 # Risk category thresholds (score is 0-100 scale)
 RISK_THRESHOLDS = {
-    RiskCategory.SAFE: (0, 25),        # score < 25
-    RiskCategory.LOW: (25, 50),        # 25 <= score < 50
-    RiskCategory.MODERATE: (50, 70),   # 50 <= score < 70
-    RiskCategory.HIGH: (70, 85),       # 70 <= score < 85
-    RiskCategory.CRITICAL: (85, 101),  # score >= 85
 }
 # Binary classification threshold
-PHISHING_THRESHOLD = 70  # score >= 70 is classified as Phishing
 def get_risk_category(phish_probability_score: float) -> RiskCategory:
     """
     Determine the risk category based on phishing probability score.
     Args:
         phish_probability_score: Phishing probability score (0-100 scale)
     Returns:
         RiskCategory: The corresponding risk category
     """
-    if phish_probability_score < 25:
         return RiskCategory.SAFE
-    elif phish_probability_score < 50:
         return RiskCategory.LOW
-    elif phish_probability_score < 70:
         return RiskCategory.MODERATE
-    elif phish_probability_score < 85:
         return RiskCategory.HIGH
     else:
         return RiskCategory.CRITICAL
@@ -62,14 +69,14 @@ def get_risk_category(phish_probability_score: float) -> RiskCategory:
 def get_binary_classification(phish_probability_score: float) -> BinaryClassification:
     """
     Determine the binary classification based on phishing probability score.
     Args:
         phish_probability_score: Phishing probability score (0-100 scale)
     Returns:
-        BinaryClassification: Legitimate if score < 70, Phishing otherwise
     """
-    if phish_probability_score < PHISHING_THRESHOLD:
         return BinaryClassification.LEGITIMATE
     else:
         return BinaryClassification.PHISHING

     SAFE = "Safe"
     LOW = "Low"
     MODERATE = "Moderate"
+    HIGH = "High"
     CRITICAL = "Critical"
 # Risk category thresholds (score is 0-100 scale)
 RISK_THRESHOLDS = {
+    RiskCategory.SAFE: (0, 30),        # 0-30%: score <= 30
+    RiskCategory.LOW: (30, 50),        # 31-50%: 30 < score <= 50
+    RiskCategory.MODERATE: (50, 70),   # 51-70%: 50 < score <= 70
+    RiskCategory.HIGH: (70, 85),       # 71-85%: 70 < score <= 85
+    RiskCategory.CRITICAL: (85, 101),  # 86-100%: score > 85
 }
 # Binary classification threshold
+PHISHING_THRESHOLD = 70  # score > 70 is classified as Phishing
 def get_risk_category(phish_probability_score: float) -> RiskCategory:
     """
     Determine the risk category based on phishing probability score.
+    Thresholds (upper bounds are inclusive; scores may be fractional):
+    - Safe: score <= 30
+    - Low: 30 < score <= 50
+    - Moderate: 50 < score <= 70
+    - High: 70 < score <= 85
+    - Critical: score > 85
     Args:
         phish_probability_score: Phishing probability score (0-100 scale)
     Returns:
         RiskCategory: The corresponding risk category
     """
+    if phish_probability_score <= 30:
         return RiskCategory.SAFE
+    elif phish_probability_score <= 50:
         return RiskCategory.LOW
+    elif phish_probability_score <= 70:
         return RiskCategory.MODERATE
+    elif phish_probability_score <= 85:
         return RiskCategory.HIGH
     else:
         return RiskCategory.CRITICAL
 def get_binary_classification(phish_probability_score: float) -> BinaryClassification:
     """
     Determine the binary classification based on phishing probability score.
     Args:
         phish_probability_score: Phishing probability score (0-100 scale)
     Returns:
+        BinaryClassification: Legitimate if score <= 70, Phishing otherwise
     """
+    if phish_probability_score <= PHISHING_THRESHOLD:
         return BinaryClassification.LEGITIMATE
     else:
         return BinaryClassification.PHISHING

model/model.py CHANGED Viewed

@@ -45,6 +45,8 @@ def load_model() -> Dict[str, Any]:
     Returns:
         dict: Dictionary containing model components:
             - base_models: Dictionary of base models
             - meta_model: Final meta model
             - feature_names: List of feature names
             - model_names: List of base model names
@@ -69,12 +71,15 @@ def load_model() -> Dict[str, Any]:
         # Cache the model
         _model_cache = {
             "base_models": model_data["base_models"],
             "meta_model": model_data["meta_model"],
             "feature_names": model_data["feature_names"],
             "model_names": model_data["model_names"]
         }
-        logger.info("✅ Model loaded successfully")
         return _model_cache
     except Exception as e:
@@ -97,6 +102,7 @@ def predict_from_features(features_dict: Dict[str, Any], model_components: Dict[
         ValueError: If required features are missing
     """
     base_models = model_components["base_models"]
     meta_model = model_components["meta_model"]
     feature_names = model_components["feature_names"]
     model_names = model_components["model_names"]
@@ -120,8 +126,11 @@ def predict_from_features(features_dict: Dict[str, Any], model_components: Dict[
     meta_features_df = pd.DataFrame(meta_features, columns=[f"{n}_pred" for n in model_names])
     # Level 1: Meta-model prediction
-    final_pred = meta_model.predict(meta_features_df)[0]
-    final_prob = meta_model.predict_proba(meta_features_df)[:, 1][0]
     return {
         "predicted_label": int(final_pred),
@@ -129,6 +138,37 @@ def predict_from_features(features_dict: Dict[str, Any], model_components: Dict[
     }
 def predict_url(url: str) -> Dict[str, Any]:
     """
     Main prediction function that takes a raw URL and returns prediction.
@@ -136,7 +176,8 @@ def predict_url(url: str) -> Dict[str, Any]:
     This function:
     1. Loads the model (cached after first load)
     2. Extracts features from the URL using url_feature_extractor
-    3. Makes prediction using the stacking model
     Args:
         url: Raw URL string to analyze
@@ -173,6 +214,13 @@ def predict_url(url: str) -> Dict[str, Any]:
                 "error": "Failed to extract features - URL may be unreachable"
             }
         # Make prediction
         logger.info("Making prediction...")
         prediction_result = predict_from_features(features_dict, model_components)
@@ -223,10 +271,11 @@ def get_meta_features_and_update(url: str, true_label: int) -> Tuple[Optional[np
         # Load model components
         model_components = load_model()
         base_models = model_components["base_models"]
         meta_model = model_components["meta_model"]
         feature_names = model_components["feature_names"]
         model_names = model_components["model_names"]
         # Extract features from URL
         logger.info(f"Extracting features for update from URL: {url}")
         features_dict = extract_features(url)
@@ -236,34 +285,43 @@ def get_meta_features_and_update(url: str, true_label: int) -> Tuple[Optional[np
             logger.warning(f"Feature extraction failed for URL update: {url}")
             return None, False
         # Convert to DataFrame and ensure proper ordering
         X = pd.DataFrame([features_dict])
         missing_cols = set(feature_names) - set(X.columns)
         if missing_cols:
             raise ValueError(f"Missing required features: {missing_cols}")
         X = X[feature_names]
         # Generate meta-features using base models (probability outputs)
         meta_features = np.zeros((X.shape[0], len(base_models)))
         for idx, (model_name, model) in enumerate(base_models.items()):
             meta_features[:, idx] = model.predict_proba(X)[:, 1]
         meta_features_df = pd.DataFrame(meta_features, columns=[f"{n}_pred" for n in model_names])
         # Update the SGD meta model using partial_fit
         logger.info(f"Updating meta model with partial_fit for label: {true_label}")
-        meta_model.partial_fit(meta_features_df, [true_label], classes=[0, 1])
         # Update the cached model with the new meta model
         global _model_cache
         if _model_cache is not None:
             _model_cache["meta_model"] = meta_model
         # Save the updated model to disk
         save_updated_model(model_components, meta_model)
         logger.info(f"✅ Model updated successfully for URL: {url}")
-        return meta_features_df.values[0], True
     except Exception as e:
         logger.error(f"❌ Failed to update model: {str(e)}")
@@ -273,26 +331,28 @@ def get_meta_features_and_update(url: str, true_label: int) -> Tuple[Optional[np
 def save_updated_model(model_components: Dict[str, Any], updated_meta_model) -> None:
     """
     Save the updated model components to disk.
     Args:
         model_components: Dictionary containing model components
         updated_meta_model: The updated SGD meta model
     """
     try:
         model_path = get_model_path()
         # Create updated model data
         updated_model_data = {
             "base_models": model_components["base_models"],
             "meta_model": updated_meta_model,  # Use the updated meta model
             "feature_names": model_components["feature_names"],
             "model_names": model_components["model_names"]
         }
         # Save to disk
         joblib.dump(updated_model_data, model_path)
         logger.info(f"✅ Updated model saved to: {model_path}")
     except Exception as e:
         logger.error(f"❌ Failed to save updated model: {str(e)}")
         raise

     Returns:
         dict: Dictionary containing model components:
             - base_models: Dictionary of base models
+            - meta_scaler: Scaler for meta features (RobustScaler or StandardScaler)
+            - scaler_name: Name of the scaler used (for logging)
             - meta_model: Final meta model
             - feature_names: List of feature names
             - model_names: List of base model names
         # Cache the model
         _model_cache = {
             "base_models": model_data["base_models"],
+            "meta_scaler": model_data["meta_scaler"],
+            "scaler_name": model_data.get("scaler_name", "Unknown"),
             "meta_model": model_data["meta_model"],
             "feature_names": model_data["feature_names"],
             "model_names": model_data["model_names"]
         }
+        scaler_name = _model_cache["scaler_name"]
+        logger.info(f"✅ Model loaded successfully (Meta scaler: {scaler_name})")
         return _model_cache
     except Exception as e:
         ValueError: If required features are missing
     """
     base_models = model_components["base_models"]
+    meta_scaler = model_components["meta_scaler"]
     meta_model = model_components["meta_model"]
     feature_names = model_components["feature_names"]
     model_names = model_components["model_names"]
     meta_features_df = pd.DataFrame(meta_features, columns=[f"{n}_pred" for n in model_names])
     # Level 1: Meta-model prediction
+    meta_scaled = meta_scaler.transform(meta_features_df)
+    meta_scaled = pd.DataFrame(meta_scaled, columns=meta_features_df.columns)
+    final_pred = meta_model.predict(meta_scaled)[0]
+    final_prob = meta_model.predict_proba(meta_scaled)[:, 1][0]
     return {
         "predicted_label": int(final_pred),
     }
+def sanitize_features(features_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Sanitize features by replacing -1 values (failed extractions) with 0.
+    This handles cases where feature extraction partially failed but still
+    returned some valid features (e.g., when Playwright successfully fetches
+    a page but some individual feature extractions fail).
+    Args:
+        features_dict: Dictionary of extracted features
+    Returns:
+        dict: Sanitized features with -1 values replaced
+    """
+    sanitized = features_dict.copy()
+    # No per-type defaults are defined; every feature type falls back to 0:
+    # Binary features (has_*) default to 0 (not present)
+    # Count features (number_of_*) default to 0
+    # Length features (length_of_*) default to 0
+    for key, value in sanitized.items():
+        if value == -1:
+            # Replace -1 with 0 for all feature types
+            # This is conservative: assumes missing features are not present
+            sanitized[key] = 0
+            logger.debug(f"Sanitized feature '{key}': -1 -> 0")
+    return sanitized
 def predict_url(url: str) -> Dict[str, Any]:
     """
     Main prediction function that takes a raw URL and returns prediction.
     This function:
     1. Loads the model (cached after first load)
     2. Extracts features from the URL using url_feature_extractor
+    3. Sanitizes features (replaces -1 with 0)
+    4. Makes prediction using the stacking model
     Args:
         url: Raw URL string to analyze
                 "error": "Failed to extract features - URL may be unreachable"
             }
+        # Sanitize features: replace -1 values (failed extractions) with 0
+        # This allows partial feature extraction to still produce predictions
+        failed_features = sum(1 for v in features_dict.values() if v == -1)
+        if failed_features > 0:
+            logger.warning(f"⚠ {failed_features} features failed extraction, using defaults")
+            features_dict = sanitize_features(features_dict)
         # Make prediction
         logger.info("Making prediction...")
         prediction_result = predict_from_features(features_dict, model_components)
         # Load model components
         model_components = load_model()
         base_models = model_components["base_models"]
+        meta_scaler = model_components["meta_scaler"]
         meta_model = model_components["meta_model"]
         feature_names = model_components["feature_names"]
         model_names = model_components["model_names"]
         # Extract features from URL
         logger.info(f"Extracting features for update from URL: {url}")
         features_dict = extract_features(url)
             logger.warning(f"Feature extraction failed for URL update: {url}")
             return None, False
+        # Sanitize features: replace -1 values (failed extractions) with 0
+        failed_features = sum(1 for v in features_dict.values() if v == -1)
+        if failed_features > 0:
+            logger.warning(f"⚠ {failed_features} features failed extraction during update, using defaults")
+            features_dict = sanitize_features(features_dict)
         # Convert to DataFrame and ensure proper ordering
         X = pd.DataFrame([features_dict])
         missing_cols = set(feature_names) - set(X.columns)
         if missing_cols:
             raise ValueError(f"Missing required features: {missing_cols}")
         X = X[feature_names]
         # Generate meta-features using base models (probability outputs)
         meta_features = np.zeros((X.shape[0], len(base_models)))
         for idx, (model_name, model) in enumerate(base_models.items()):
             meta_features[:, idx] = model.predict_proba(X)[:, 1]
         meta_features_df = pd.DataFrame(meta_features, columns=[f"{n}_pred" for n in model_names])
+        # Scale meta-features
+        meta_scaled = meta_scaler.transform(meta_features_df)
         # Update the SGD meta model using partial_fit
         logger.info(f"Updating meta model with partial_fit for label: {true_label}")
+        meta_model.partial_fit(meta_scaled, [true_label], classes=[0, 1])
         # Update the cached model with the new meta model
         global _model_cache
         if _model_cache is not None:
             _model_cache["meta_model"] = meta_model
         # Save the updated model to disk
         save_updated_model(model_components, meta_model)
         logger.info(f"✅ Model updated successfully for URL: {url}")
+        return meta_scaled[0], True
     except Exception as e:
         logger.error(f"❌ Failed to update model: {str(e)}")
 def save_updated_model(model_components: Dict[str, Any], updated_meta_model) -> None:
     """
     Save the updated model components to disk.
     Args:
         model_components: Dictionary containing model components
         updated_meta_model: The updated SGD meta model
     """
     try:
         model_path = get_model_path()
         # Create updated model data
         updated_model_data = {
             "base_models": model_components["base_models"],
+            "meta_scaler": model_components["meta_scaler"],
+            "scaler_name": model_components.get("scaler_name", "Unknown"),
             "meta_model": updated_meta_model,  # Use the updated meta model
             "feature_names": model_components["feature_names"],
             "model_names": model_components["model_names"]
         }
         # Save to disk
         joblib.dump(updated_model_data, model_path)
         logger.info(f"✅ Updated model saved to: {model_path}")
     except Exception as e:
         logger.error(f"❌ Failed to save updated model: {str(e)}")
         raise