Rasel Santillan committed on
Commit
b5d7eef
·
1 Parent(s): 8a9ac80

Describe your changes

Browse files
Files changed (2) hide show
  1. categorization.py +24 -17
  2. model/model.py +77 -17
categorization.py CHANGED
@@ -14,7 +14,7 @@ class RiskCategory(str, Enum):
14
  SAFE = "Safe"
15
  LOW = "Low"
16
  MODERATE = "Moderate"
17
- HIGH = "Dangerous"
18
  CRITICAL = "Critical"
19
 
20
 
@@ -26,34 +26,41 @@ class BinaryClassification(str, Enum):
26
 
27
  # Risk category thresholds (score is 0-100 scale)
28
  RISK_THRESHOLDS = {
29
- RiskCategory.SAFE: (0, 25), # score < 25
30
- RiskCategory.LOW: (25, 50), # 25 <= score < 50
31
- RiskCategory.MODERATE: (50, 70), # 50 <= score < 70
32
- RiskCategory.HIGH: (70, 85), # 70 <= score < 85
33
- RiskCategory.CRITICAL: (85, 101), # score >= 85
34
  }
35
 
36
  # Binary classification threshold
37
- PHISHING_THRESHOLD = 70 # score >= 70 is classified as Phishing
38
 
39
 
40
  def get_risk_category(phish_probability_score: float) -> RiskCategory:
41
  """
42
  Determine the risk category based on phishing probability score.
43
-
 
 
 
 
 
 
 
44
  Args:
45
  phish_probability_score: Phishing probability score (0-100 scale)
46
-
47
  Returns:
48
  RiskCategory: The corresponding risk category
49
  """
50
- if phish_probability_score < 25:
51
  return RiskCategory.SAFE
52
- elif phish_probability_score < 50:
53
  return RiskCategory.LOW
54
- elif phish_probability_score < 70:
55
  return RiskCategory.MODERATE
56
- elif phish_probability_score < 85:
57
  return RiskCategory.HIGH
58
  else:
59
  return RiskCategory.CRITICAL
@@ -62,14 +69,14 @@ def get_risk_category(phish_probability_score: float) -> RiskCategory:
62
  def get_binary_classification(phish_probability_score: float) -> BinaryClassification:
63
  """
64
  Determine the binary classification based on phishing probability score.
65
-
66
  Args:
67
  phish_probability_score: Phishing probability score (0-100 scale)
68
-
69
  Returns:
70
- BinaryClassification: Legitimate if score < 70, Phishing otherwise
71
  """
72
- if phish_probability_score < PHISHING_THRESHOLD:
73
  return BinaryClassification.LEGITIMATE
74
  else:
75
  return BinaryClassification.PHISHING
 
14
  SAFE = "Safe"
15
  LOW = "Low"
16
  MODERATE = "Moderate"
17
+ HIGH = "High"
18
  CRITICAL = "Critical"
19
 
20
 
 
26
 
27
  # Risk category thresholds (score is 0-100 scale)
28
  RISK_THRESHOLDS = {
29
+ RiskCategory.SAFE: (0, 30), # 0-30%: score <= 30
30
+ RiskCategory.LOW: (30, 50), # 31-50%: 30 < score <= 50
31
+ RiskCategory.MODERATE: (50, 70), # 51-70%: 50 < score <= 70
32
+ RiskCategory.HIGH: (70, 85), # 71-85%: 70 < score <= 85
33
+ RiskCategory.CRITICAL: (85, 101), # 86-100%: score > 85
34
  }
35
 
36
  # Binary classification threshold
37
+ PHISHING_THRESHOLD = 70 # score > 70 is classified as Phishing
38
 
39
 
40
  def get_risk_category(phish_probability_score: float) -> RiskCategory:
41
  """
42
  Determine the risk category based on phishing probability score.
43
+
44
+ Thresholds:
45
+ - Safe: 0-30%
46
+ - Low: 31-50%
47
+ - Moderate: 51-70%
48
+ - High: 71-85%
49
+ - Critical: 86-100%
50
+
51
  Args:
52
  phish_probability_score: Phishing probability score (0-100 scale)
53
+
54
  Returns:
55
  RiskCategory: The corresponding risk category
56
  """
57
+ if phish_probability_score <= 30:
58
  return RiskCategory.SAFE
59
+ elif phish_probability_score <= 50:
60
  return RiskCategory.LOW
61
+ elif phish_probability_score <= 70:
62
  return RiskCategory.MODERATE
63
+ elif phish_probability_score <= 85:
64
  return RiskCategory.HIGH
65
  else:
66
  return RiskCategory.CRITICAL
 
69
  def get_binary_classification(phish_probability_score: float) -> BinaryClassification:
70
  """
71
  Determine the binary classification based on phishing probability score.
72
+
73
  Args:
74
  phish_probability_score: Phishing probability score (0-100 scale)
75
+
76
  Returns:
77
+ BinaryClassification: Legitimate if score <= 70, Phishing otherwise
78
  """
79
+ if phish_probability_score <= PHISHING_THRESHOLD:
80
  return BinaryClassification.LEGITIMATE
81
  else:
82
  return BinaryClassification.PHISHING
model/model.py CHANGED
@@ -45,6 +45,8 @@ def load_model() -> Dict[str, Any]:
45
  Returns:
46
  dict: Dictionary containing model components:
47
  - base_models: Dictionary of base models
 
 
48
  - meta_model: Final meta model
49
  - feature_names: List of feature names
50
  - model_names: List of base model names
@@ -69,12 +71,15 @@ def load_model() -> Dict[str, Any]:
69
  # Cache the model
70
  _model_cache = {
71
  "base_models": model_data["base_models"],
 
 
72
  "meta_model": model_data["meta_model"],
73
  "feature_names": model_data["feature_names"],
74
  "model_names": model_data["model_names"]
75
  }
76
 
77
- logger.info("✅ Model loaded successfully")
 
78
  return _model_cache
79
 
80
  except Exception as e:
@@ -97,6 +102,7 @@ def predict_from_features(features_dict: Dict[str, Any], model_components: Dict[
97
  ValueError: If required features are missing
98
  """
99
  base_models = model_components["base_models"]
 
100
  meta_model = model_components["meta_model"]
101
  feature_names = model_components["feature_names"]
102
  model_names = model_components["model_names"]
@@ -120,8 +126,11 @@ def predict_from_features(features_dict: Dict[str, Any], model_components: Dict[
120
  meta_features_df = pd.DataFrame(meta_features, columns=[f"{n}_pred" for n in model_names])
121
 
122
  # Level 1: Meta-model prediction
123
- final_pred = meta_model.predict(meta_features_df)[0]
124
- final_prob = meta_model.predict_proba(meta_features_df)[:, 1][0]
 
 
 
125
 
126
  return {
127
  "predicted_label": int(final_pred),
@@ -129,6 +138,37 @@ def predict_from_features(features_dict: Dict[str, Any], model_components: Dict[
129
  }
130
 
131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  def predict_url(url: str) -> Dict[str, Any]:
133
  """
134
  Main prediction function that takes a raw URL and returns prediction.
@@ -136,7 +176,8 @@ def predict_url(url: str) -> Dict[str, Any]:
136
  This function:
137
  1. Loads the model (cached after first load)
138
  2. Extracts features from the URL using url_feature_extractor
139
- 3. Makes prediction using the stacking model
 
140
 
141
  Args:
142
  url: Raw URL string to analyze
@@ -173,6 +214,13 @@ def predict_url(url: str) -> Dict[str, Any]:
173
  "error": "Failed to extract features - URL may be unreachable"
174
  }
175
 
 
 
 
 
 
 
 
176
  # Make prediction
177
  logger.info("Making prediction...")
178
  prediction_result = predict_from_features(features_dict, model_components)
@@ -223,10 +271,11 @@ def get_meta_features_and_update(url: str, true_label: int) -> Tuple[Optional[np
223
  # Load model components
224
  model_components = load_model()
225
  base_models = model_components["base_models"]
 
226
  meta_model = model_components["meta_model"]
227
  feature_names = model_components["feature_names"]
228
  model_names = model_components["model_names"]
229
-
230
  # Extract features from URL
231
  logger.info(f"Extracting features for update from URL: {url}")
232
  features_dict = extract_features(url)
@@ -236,34 +285,43 @@ def get_meta_features_and_update(url: str, true_label: int) -> Tuple[Optional[np
236
  logger.warning(f"Feature extraction failed for URL update: {url}")
237
  return None, False
238
 
 
 
 
 
 
 
239
  # Convert to DataFrame and ensure proper ordering
240
  X = pd.DataFrame([features_dict])
241
  missing_cols = set(feature_names) - set(X.columns)
242
  if missing_cols:
243
  raise ValueError(f"Missing required features: {missing_cols}")
244
  X = X[feature_names]
245
-
246
  # Generate meta-features using base models (probability outputs)
247
  meta_features = np.zeros((X.shape[0], len(base_models)))
248
  for idx, (model_name, model) in enumerate(base_models.items()):
249
  meta_features[:, idx] = model.predict_proba(X)[:, 1]
250
-
251
  meta_features_df = pd.DataFrame(meta_features, columns=[f"{n}_pred" for n in model_names])
252
-
 
 
 
253
  # Update the SGD meta model using partial_fit
254
  logger.info(f"Updating meta model with partial_fit for label: {true_label}")
255
- meta_model.partial_fit(meta_features_df, [true_label], classes=[0, 1])
256
-
257
  # Update the cached model with the new meta model
258
  global _model_cache
259
  if _model_cache is not None:
260
  _model_cache["meta_model"] = meta_model
261
-
262
  # Save the updated model to disk
263
  save_updated_model(model_components, meta_model)
264
-
265
  logger.info(f"✅ Model updated successfully for URL: {url}")
266
- return meta_features_df.values[0], True
267
 
268
  except Exception as e:
269
  logger.error(f"❌ Failed to update model: {str(e)}")
@@ -273,26 +331,28 @@ def get_meta_features_and_update(url: str, true_label: int) -> Tuple[Optional[np
273
  def save_updated_model(model_components: Dict[str, Any], updated_meta_model) -> None:
274
  """
275
  Save the updated model components to disk.
276
-
277
  Args:
278
  model_components: Dictionary containing model components
279
  updated_meta_model: The updated SGD meta model
280
  """
281
  try:
282
  model_path = get_model_path()
283
-
284
  # Create updated model data
285
  updated_model_data = {
286
  "base_models": model_components["base_models"],
 
 
287
  "meta_model": updated_meta_model, # Use the updated meta model
288
  "feature_names": model_components["feature_names"],
289
  "model_names": model_components["model_names"]
290
  }
291
-
292
  # Save to disk
293
  joblib.dump(updated_model_data, model_path)
294
  logger.info(f"✅ Updated model saved to: {model_path}")
295
-
296
  except Exception as e:
297
  logger.error(f"❌ Failed to save updated model: {str(e)}")
298
  raise
 
45
  Returns:
46
  dict: Dictionary containing model components:
47
  - base_models: Dictionary of base models
48
+ - meta_scaler: Scaler for meta features (RobustScaler or StandardScaler)
49
+ - scaler_name: Name of the scaler used (for logging)
50
  - meta_model: Final meta model
51
  - feature_names: List of feature names
52
  - model_names: List of base model names
 
71
  # Cache the model
72
  _model_cache = {
73
  "base_models": model_data["base_models"],
74
+ "meta_scaler": model_data["meta_scaler"],
75
+ "scaler_name": model_data.get("scaler_name", "Unknown"),
76
  "meta_model": model_data["meta_model"],
77
  "feature_names": model_data["feature_names"],
78
  "model_names": model_data["model_names"]
79
  }
80
 
81
+ scaler_name = _model_cache["scaler_name"]
82
+ logger.info(f"✅ Model loaded successfully (Meta scaler: {scaler_name})")
83
  return _model_cache
84
 
85
  except Exception as e:
 
102
  ValueError: If required features are missing
103
  """
104
  base_models = model_components["base_models"]
105
+ meta_scaler = model_components["meta_scaler"]
106
  meta_model = model_components["meta_model"]
107
  feature_names = model_components["feature_names"]
108
  model_names = model_components["model_names"]
 
126
  meta_features_df = pd.DataFrame(meta_features, columns=[f"{n}_pred" for n in model_names])
127
 
128
  # Level 1: Meta-model prediction
129
+ meta_scaled = meta_scaler.transform(meta_features_df)
130
+ meta_scaled = pd.DataFrame(meta_scaled, columns=meta_features_df.columns)
131
+
132
+ final_pred = meta_model.predict(meta_scaled)[0]
133
+ final_prob = meta_model.predict_proba(meta_scaled)[:, 1][0]
134
 
135
  return {
136
  "predicted_label": int(final_pred),
 
138
  }
139
 
140
 
141
+ def sanitize_features(features_dict: Dict[str, Any]) -> Dict[str, Any]:
142
+ """
143
+ Sanitize features by replacing -1 values with sensible defaults.
144
+
145
+ This handles cases where feature extraction partially failed but still
146
+ returned some valid features (e.g., when Playwright successfully fetches
147
+ a page but some individual feature extractions fail).
148
+
149
+ Args:
150
+ features_dict: Dictionary of extracted features
151
+
152
+ Returns:
153
+ dict: Sanitized features with -1 values replaced
154
+ """
155
+ sanitized = features_dict.copy()
156
+
157
+ # Define default values for different feature types
158
+ # Binary features (has_*) default to 0 (not present)
159
+ # Count features (number_of_*) default to 0
160
+ # Length features (length_of_*) default to 0
161
+
162
+ for key, value in sanitized.items():
163
+ if value == -1:
164
+ # Replace -1 with 0 for all feature types
165
+ # This is conservative: assumes missing features are not present
166
+ sanitized[key] = 0
167
+ logger.debug(f"Sanitized feature '{key}': -1 -> 0")
168
+
169
+ return sanitized
170
+
171
+
172
  def predict_url(url: str) -> Dict[str, Any]:
173
  """
174
  Main prediction function that takes a raw URL and returns prediction.
 
176
  This function:
177
  1. Loads the model (cached after first load)
178
  2. Extracts features from the URL using url_feature_extractor
179
+ 3. Sanitizes features (replaces -1 with 0)
180
+ 4. Makes prediction using the stacking model
181
 
182
  Args:
183
  url: Raw URL string to analyze
 
214
  "error": "Failed to extract features - URL may be unreachable"
215
  }
216
 
217
+ # Sanitize features: replace -1 values with sensible defaults
218
+ # This allows partial feature extraction to still produce predictions
219
+ failed_features = sum(1 for v in features_dict.values() if v == -1)
220
+ if failed_features > 0:
221
+ logger.warning(f"⚠ {failed_features} features failed extraction, using defaults")
222
+ features_dict = sanitize_features(features_dict)
223
+
224
  # Make prediction
225
  logger.info("Making prediction...")
226
  prediction_result = predict_from_features(features_dict, model_components)
 
271
  # Load model components
272
  model_components = load_model()
273
  base_models = model_components["base_models"]
274
+ meta_scaler = model_components["meta_scaler"]
275
  meta_model = model_components["meta_model"]
276
  feature_names = model_components["feature_names"]
277
  model_names = model_components["model_names"]
278
+
279
  # Extract features from URL
280
  logger.info(f"Extracting features for update from URL: {url}")
281
  features_dict = extract_features(url)
 
285
  logger.warning(f"Feature extraction failed for URL update: {url}")
286
  return None, False
287
 
288
+ # Sanitize features: replace -1 values with sensible defaults
289
+ failed_features = sum(1 for v in features_dict.values() if v == -1)
290
+ if failed_features > 0:
291
+ logger.warning(f"⚠ {failed_features} features failed extraction during update, using defaults")
292
+ features_dict = sanitize_features(features_dict)
293
+
294
  # Convert to DataFrame and ensure proper ordering
295
  X = pd.DataFrame([features_dict])
296
  missing_cols = set(feature_names) - set(X.columns)
297
  if missing_cols:
298
  raise ValueError(f"Missing required features: {missing_cols}")
299
  X = X[feature_names]
300
+
301
  # Generate meta-features using base models (probability outputs)
302
  meta_features = np.zeros((X.shape[0], len(base_models)))
303
  for idx, (model_name, model) in enumerate(base_models.items()):
304
  meta_features[:, idx] = model.predict_proba(X)[:, 1]
305
+
306
  meta_features_df = pd.DataFrame(meta_features, columns=[f"{n}_pred" for n in model_names])
307
+
308
+ # Scale meta-features
309
+ meta_scaled = meta_scaler.transform(meta_features_df)
310
+
311
  # Update the SGD meta model using partial_fit
312
  logger.info(f"Updating meta model with partial_fit for label: {true_label}")
313
+ meta_model.partial_fit(meta_scaled, [true_label], classes=[0, 1])
314
+
315
  # Update the cached model with the new meta model
316
  global _model_cache
317
  if _model_cache is not None:
318
  _model_cache["meta_model"] = meta_model
319
+
320
  # Save the updated model to disk
321
  save_updated_model(model_components, meta_model)
322
+
323
  logger.info(f"✅ Model updated successfully for URL: {url}")
324
+ return meta_scaled[0], True
325
 
326
  except Exception as e:
327
  logger.error(f"❌ Failed to update model: {str(e)}")
 
331
  def save_updated_model(model_components: Dict[str, Any], updated_meta_model) -> None:
332
  """
333
  Save the updated model components to disk.
334
+
335
  Args:
336
  model_components: Dictionary containing model components
337
  updated_meta_model: The updated SGD meta model
338
  """
339
  try:
340
  model_path = get_model_path()
341
+
342
  # Create updated model data
343
  updated_model_data = {
344
  "base_models": model_components["base_models"],
345
+ "meta_scaler": model_components["meta_scaler"],
346
+ "scaler_name": model_components.get("scaler_name", "Unknown"),
347
  "meta_model": updated_meta_model, # Use the updated meta model
348
  "feature_names": model_components["feature_names"],
349
  "model_names": model_components["model_names"]
350
  }
351
+
352
  # Save to disk
353
  joblib.dump(updated_model_data, model_path)
354
  logger.info(f"✅ Updated model saved to: {model_path}")
355
+
356
  except Exception as e:
357
  logger.error(f"❌ Failed to save updated model: {str(e)}")
358
  raise