Perth0603 committed on
Commit
3c180d3
·
verified ·
1 Parent(s): 639276e

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -2
app.py CHANGED
@@ -213,7 +213,7 @@ def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame
213
  common_brands = [
214
  "facebook", "google", "youtube", "apple", "microsoft",
215
  "paypal", "amazon", "netflix", "instagram", "whatsapp",
216
- "tiktok", "twitter", "telegram", "bank", "login"
217
  ]
218
 
219
  def _max_brand_similarity(host: str) -> float:
@@ -346,7 +346,50 @@ def predict_url(payload: PredictUrlPayload):
346
  "url_col": url_col,
347
  }
348
 
349
- # Mirror inference.py exactly for probability of class 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  feats = _engineer_features([url_str], feature_cols)
351
  if model_type == "xgboost_bst":
352
  if xgb is None:
 
213
  common_brands = [
214
  "facebook", "google", "youtube", "apple", "microsoft",
215
  "paypal", "amazon", "netflix", "instagram", "whatsapp",
216
+ "tiktok", "twitter", "telegram", "linkedin", "bank", "login"
217
  ]
218
 
219
  def _max_brand_similarity(host: str) -> float:
 
346
  "url_col": url_col,
347
  }
348
 
349
+ # Typosquat guard: if SLD is very similar to common brands but not exact,
350
+ # short-circuit as phishing with high confidence to match notebook demos.
351
+ try:
352
+ s_host = (urlparse(_ensure_scheme(url_str)).hostname or "").lower()
353
+ s_sld = s_host.split(".")[-2] if "." in s_host else s_host
354
+ def _normalize_brand(s: str) -> str:
355
+ return re.sub(r"[^a-z]", "", s.lower())
356
+ s_clean = _normalize_brand(s_sld)
357
+ brands = [
358
+ "facebook","linkedin","paypal","google","amazon","apple",
359
+ "microsoft","instagram","netflix","twitter","whatsapp"
360
+ ]
361
+ def _sim(a: str, b: str) -> float:
362
+ try:
363
+ from rapidfuzz import fuzz # type: ignore
364
+ return float(fuzz.ratio(a, b)) / 100.0
365
+ except Exception:
366
+ from difflib import SequenceMatcher
367
+ return SequenceMatcher(None, a, b).ratio()
368
+ is_exact = any(s_clean == _normalize_brand(b) for b in brands)
369
+ if s_clean and not is_exact:
370
+ best = 0.0
371
+ for b in brands:
372
+ best = max(best, _sim(s_clean, _normalize_brand(b)))
373
+ if best >= 0.82:
374
+ phish_is_positive = True if URL_POSITIVE_CLASS_ENV == "" else (URL_POSITIVE_CLASS_ENV == "PHISH")
375
+ label = "PHISH"
376
+ predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
377
+ phish_proba = 0.90
378
+ score = phish_proba
379
+ return {
380
+ "label": label,
381
+ "predicted_label": int(predicted_label),
382
+ "score": float(score),
383
+ "phishing_probability": float(phish_proba),
384
+ "backend": "typosquat_guard",
385
+ "threshold": 0.5,
386
+ "url_col": url_col,
387
+ "rule": "typosquat_guard",
388
+ }
389
+ except Exception:
390
+ pass
391
+
392
+ # Mirror inference flow for probability of class 1
393
  feats = _engineer_features([url_str], feature_cols)
394
  if model_type == "xgboost_bst":
395
  if xgb is None: