ExistedYear committed on
Commit
348b3a4
·
1 Parent(s): 2df84b2

GSB priority in green channel

Browse files
smishing_detector/predictor.py CHANGED
@@ -293,35 +293,64 @@ class SmishingPredictor:
293
  risk = "low"
294
  return label, risk, final_prob, False, ""
295
 
296
- # Build unified legit domain set (safe_browsing + data_loader)
297
- legit_domains = set()
298
  try:
299
  from utils.safe_browsing import get_checker
300
- legit_domains |= get_checker().fallback_legit_domains
301
  except Exception:
302
  pass
303
  try:
304
  from utils.data_loader import LEGIT_DOMAINS
305
- legit_domains |= LEGIT_DOMAINS
306
  except Exception:
307
  pass
308
 
309
  has_url = bool(uf.get("has_url"))
310
  has_suspicious = bool(uf.get("suspicious_tld") or uf.get("has_ip_url"))
311
 
312
- # Case 1: Message has URLs — check if ALL are legit
313
  if has_url and not has_suspicious:
314
  try:
315
  import tldextract
316
  urls = extract_urls(message)
317
  if urls:
 
 
 
 
 
 
 
318
  all_legit = True
 
319
  for url in urls:
320
  ext = tldextract.extract(url)
321
  full_domain = f"{ext.domain}.{ext.suffix}".lower().strip(".")
322
- if full_domain not in legit_domains:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
323
  all_legit = False
324
  break
 
 
 
325
  if all_legit:
326
  return "safe", "low", min(final_prob, 0.40), True, \
327
  "all URLs verified as legitimate domains"
 
293
  risk = "low"
294
  return label, risk, final_prob, False, ""
295
 
296
+ # Build static legit domain set (fallback whitelist)
297
+ static_legit = set()
298
  try:
299
  from utils.safe_browsing import get_checker
300
+ static_legit |= get_checker().fallback_legit_domains
301
  except Exception:
302
  pass
303
  try:
304
  from utils.data_loader import LEGIT_DOMAINS
305
+ static_legit |= LEGIT_DOMAINS
306
  except Exception:
307
  pass
308
 
309
  has_url = bool(uf.get("has_url"))
310
  has_suspicious = bool(uf.get("suspicious_tld") or uf.get("has_ip_url"))
311
 
312
+ # Case 1: Message has URLs — check each via GSB then static whitelist
313
  if has_url and not has_suspicious:
314
  try:
315
  import tldextract
316
  urls = extract_urls(message)
317
  if urls:
318
+ checker = None
319
+ try:
320
+ from utils.safe_browsing import get_checker as _gc
321
+ checker = _gc()
322
+ except Exception:
323
+ pass
324
+
325
  all_legit = True
326
+ malicious_found = False
327
  for url in urls:
328
  ext = tldextract.extract(url)
329
  full_domain = f"{ext.domain}.{ext.suffix}".lower().strip(".")
330
+ if not full_domain:
331
+ continue
332
+
333
+ # GSB first (cached, live lookup if needed)
334
+ status = "unknown"
335
+ if checker:
336
+ try:
337
+ status = checker.check_domain_status(full_domain)
338
+ except Exception:
339
+ pass
340
+
341
+ if status == "known_malicious":
342
+ malicious_found = True
343
+ break
344
+ elif status == "known_safe":
345
+ continue # GSB verified safe
346
+ elif full_domain in static_legit:
347
+ continue # static whitelist
348
+ else:
349
  all_legit = False
350
  break
351
+
352
+ if malicious_found:
353
+ return "spam", "high", max(final_prob, 0.90), False, ""
354
  if all_legit:
355
  return "safe", "low", min(final_prob, 0.40), True, \
356
  "all URLs verified as legitimate domains"
smishing_detector/utils/data_loader.py CHANGED
@@ -53,8 +53,7 @@ LEGIT_DOMAINS = {
53
  }
54
 
55
  TRUSTED_TLDS = {
56
- "com", "org", "net", "edu", "gov", "gov.in", "co.uk", "au",
57
- "ca", "de", "fr", "in", "co.in", "sbi",
58
  }
59
 
60
  URL_FEATURE_COLS = [
 
53
  }
54
 
55
  TRUSTED_TLDS = {
56
+ "gov", "gov.in", "edu", "sbi",
 
57
  }
58
 
59
  URL_FEATURE_COLS = [