Commit ·
348b3a4
1
Parent(s): 2df84b2
GSB priority in green channel
Browse files
smishing_detector/predictor.py
CHANGED
|
@@ -293,35 +293,64 @@ class SmishingPredictor:
|
|
| 293 |
risk = "low"
|
| 294 |
return label, risk, final_prob, False, ""
|
| 295 |
|
| 296 |
-
# Build
|
| 297 |
-
|
| 298 |
try:
|
| 299 |
from utils.safe_browsing import get_checker
|
| 300 |
-
|
| 301 |
except Exception:
|
| 302 |
pass
|
| 303 |
try:
|
| 304 |
from utils.data_loader import LEGIT_DOMAINS
|
| 305 |
-
|
| 306 |
except Exception:
|
| 307 |
pass
|
| 308 |
|
| 309 |
has_url = bool(uf.get("has_url"))
|
| 310 |
has_suspicious = bool(uf.get("suspicious_tld") or uf.get("has_ip_url"))
|
| 311 |
|
| 312 |
-
# Case 1: Message has URLs — check
|
| 313 |
if has_url and not has_suspicious:
|
| 314 |
try:
|
| 315 |
import tldextract
|
| 316 |
urls = extract_urls(message)
|
| 317 |
if urls:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
all_legit = True
|
|
|
|
| 319 |
for url in urls:
|
| 320 |
ext = tldextract.extract(url)
|
| 321 |
full_domain = f"{ext.domain}.{ext.suffix}".lower().strip(".")
|
| 322 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
all_legit = False
|
| 324 |
break
|
|
|
|
|
|
|
|
|
|
| 325 |
if all_legit:
|
| 326 |
return "safe", "low", min(final_prob, 0.40), True, \
|
| 327 |
"all URLs verified as legitimate domains"
|
|
|
|
| 293 |
risk = "low"
|
| 294 |
return label, risk, final_prob, False, ""
|
| 295 |
|
| 296 |
+
# Build static legit domain set (fallback whitelist)
|
| 297 |
+
static_legit = set()
|
| 298 |
try:
|
| 299 |
from utils.safe_browsing import get_checker
|
| 300 |
+
static_legit |= get_checker().fallback_legit_domains
|
| 301 |
except Exception:
|
| 302 |
pass
|
| 303 |
try:
|
| 304 |
from utils.data_loader import LEGIT_DOMAINS
|
| 305 |
+
static_legit |= LEGIT_DOMAINS
|
| 306 |
except Exception:
|
| 307 |
pass
|
| 308 |
|
| 309 |
has_url = bool(uf.get("has_url"))
|
| 310 |
has_suspicious = bool(uf.get("suspicious_tld") or uf.get("has_ip_url"))
|
| 311 |
|
| 312 |
+
# Case 1: Message has URLs — check each via GSB then static whitelist
|
| 313 |
if has_url and not has_suspicious:
|
| 314 |
try:
|
| 315 |
import tldextract
|
| 316 |
urls = extract_urls(message)
|
| 317 |
if urls:
|
| 318 |
+
checker = None
|
| 319 |
+
try:
|
| 320 |
+
from utils.safe_browsing import get_checker as _gc
|
| 321 |
+
checker = _gc()
|
| 322 |
+
except Exception:
|
| 323 |
+
pass
|
| 324 |
+
|
| 325 |
all_legit = True
|
| 326 |
+
malicious_found = False
|
| 327 |
for url in urls:
|
| 328 |
ext = tldextract.extract(url)
|
| 329 |
full_domain = f"{ext.domain}.{ext.suffix}".lower().strip(".")
|
| 330 |
+
if not full_domain:
|
| 331 |
+
continue
|
| 332 |
+
|
| 333 |
+
# GSB first (cached, live lookup if needed)
|
| 334 |
+
status = "unknown"
|
| 335 |
+
if checker:
|
| 336 |
+
try:
|
| 337 |
+
status = checker.check_domain_status(full_domain)
|
| 338 |
+
except Exception:
|
| 339 |
+
pass
|
| 340 |
+
|
| 341 |
+
if status == "known_malicious":
|
| 342 |
+
malicious_found = True
|
| 343 |
+
break
|
| 344 |
+
elif status == "known_safe":
|
| 345 |
+
continue # GSB verified safe
|
| 346 |
+
elif full_domain in static_legit:
|
| 347 |
+
continue # static whitelist
|
| 348 |
+
else:
|
| 349 |
all_legit = False
|
| 350 |
break
|
| 351 |
+
|
| 352 |
+
if malicious_found:
|
| 353 |
+
return "spam", "high", max(final_prob, 0.90), False, ""
|
| 354 |
if all_legit:
|
| 355 |
return "safe", "low", min(final_prob, 0.40), True, \
|
| 356 |
"all URLs verified as legitimate domains"
|
smishing_detector/utils/data_loader.py
CHANGED
|
@@ -53,8 +53,7 @@ LEGIT_DOMAINS = {
|
|
| 53 |
}
|
| 54 |
|
| 55 |
TRUSTED_TLDS = {
|
| 56 |
-
"
|
| 57 |
-
"ca", "de", "fr", "in", "co.in", "sbi",
|
| 58 |
}
|
| 59 |
|
| 60 |
URL_FEATURE_COLS = [
|
|
|
|
| 53 |
}
|
| 54 |
|
| 55 |
TRUSTED_TLDS = {
|
| 56 |
+
"gov", "gov.in", "edu", "sbi",
|
|
|
|
| 57 |
}
|
| 58 |
|
| 59 |
URL_FEATURE_COLS = [
|