Perth0603 committed on
Commit
40f61ad
·
verified ·
1 Parent(s): 211af6f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -8
app.py CHANGED
@@ -75,7 +75,7 @@ if NLTK_AVAILABLE:
75
  'alert', 'warning', 'action required', 'unusual activity', 'compromised'
76
  }
77
 
78
- # Consolidated lookalike characters dictionary (used once throughout)
79
  LOOKALIKE_CHARS = {
80
  # Cyrillic
81
  'а': 'a', 'е': 'e', 'о': 'o', 'р': 'p', 'с': 'c', 'х': 'x',
@@ -90,11 +90,11 @@ LOOKALIKE_CHARS = {
90
 
91
  BRAND_NAMES = [
92
  "facebook", "linkedin", "paypal", "google", "amazon", "apple",
93
- "microsoft", "instagram", "netflix", "twitter", "whatsapp"
94
  ]
95
 
96
  SUSPICIOUS_KEYWORDS = ["login", "verify", "secure", "update", "bank", "pay", "account", "webscr"]
97
- SUSPICIOUS_TLDS = {"tk", "ml", "ga", "cf", "gq", "xyz", "top", "buzz", "icu", "fit", "rest", "work", "click", "country", "zip"}
98
 
99
 
100
  app = FastAPI(
@@ -425,7 +425,8 @@ def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame
425
  def _ratio_digits(txt: str) -> float:
426
  if not txt:
427
  return 0.0
428
- return sum(c.isdigit() for c in txt) / len(txt)
 
429
 
430
  out["sld_digit_ratio"] = sld_series.apply(_ratio_digits)
431
  out["sld_entropy"] = sld_series.apply(_shannon_entropy)
@@ -435,10 +436,11 @@ def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame
435
  if not host:
436
  return 0.0
437
  sld = host.split(".")[-2] if "." in host else host
438
- return max(
439
- SequenceMatcher(None, host, brand).ratio(),
440
- SequenceMatcher(None, sld, brand).ratio()
441
- ) for brand in BRAND_NAMES
 
442
 
443
  out["max_brand_sim"] = hosts.apply(_max_brand_similarity)
444
  out["like_facebook"] = hosts.apply(
 
75
  'alert', 'warning', 'action required', 'unusual activity', 'compromised'
76
  }
77
 
78
+ # Consolidated lookalike characters dictionary
79
  LOOKALIKE_CHARS = {
80
  # Cyrillic
81
  'а': 'a', 'е': 'e', 'о': 'o', 'р': 'p', 'с': 'c', 'х': 'x',
 
90
 
91
  BRAND_NAMES = [
92
  "facebook", "linkedin", "paypal", "google", "amazon", "apple",
93
+ "microsoft", "instagram", "netflix", "twitter", "whatsapp", "bank", "hsbc", "yahoo", "outlook"
94
  ]
95
 
96
  SUSPICIOUS_KEYWORDS = ["login", "verify", "secure", "update", "bank", "pay", "account", "webscr"]
97
+ SUSPICIOUS_TLDS = {"tk", "ml", "ga", "cf", "gq", "xyz", "top", "buzz", "icu", "fit", "rest", "work", "click", "country", "zip", "ru", "kim", "support", "ltd"}
98
 
99
 
100
  app = FastAPI(
 
425
  def _ratio_digits(txt: str) -> float:
426
  if not txt:
427
  return 0.0
428
+ digits = sum(c.isdigit() for c in txt)
429
+ return float(digits) / float(len(txt))
430
 
431
  out["sld_digit_ratio"] = sld_series.apply(_ratio_digits)
432
  out["sld_entropy"] = sld_series.apply(_shannon_entropy)
 
436
  if not host:
437
  return 0.0
438
  sld = host.split(".")[-2] if "." in host else host
439
+ similarities = []
440
+ for brand in BRAND_NAMES:
441
+ similarities.append(SequenceMatcher(None, host, brand).ratio())
442
+ similarities.append(SequenceMatcher(None, sld, brand).ratio())
443
+ return max(similarities) if similarities else 0.0
444
 
445
  out["max_brand_sim"] = hosts.apply(_max_brand_similarity)
446
  out["like_facebook"] = hosts.apply(