Commit ·
abdf9b3
1
Parent(s): 348b3a4
balls
Browse files- smishing_detector/predictor.py +45 -67
smishing_detector/predictor.py
CHANGED
|
@@ -265,56 +265,38 @@ class SmishingPredictor:
|
|
| 265 |
spam_rule: float, ham_rule: float,
|
| 266 |
uf: dict) -> Tuple[str, str, float, bool, str]:
|
| 267 |
"""
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
This project detects smishing (scam SMS), NOT general spam.
|
| 271 |
-
A message like "Congratulations! Free 5G SIM upgrade. Visit your nearest
|
| 272 |
-
Airtel store" is promotional spam, not a scam — it has no phishing URL,
|
| 273 |
-
no threats, no urgency to click a link.
|
| 274 |
-
|
| 275 |
-
Logic:
|
| 276 |
-
1. Hard scam signals (spam_rule >= 0.55) → NEVER clear. These have
|
| 277 |
-
explicit phishing patterns like "account suspended", "KYC update",
|
| 278 |
-
"digital arrest" with suspicious URLs.
|
| 279 |
-
2. Soft spam (spam_rule < 0.55) + ALL URLs are verified legit domains
|
| 280 |
-
→ GREEN CHANNEL → downgrade to "safe" (it's just an ad/promo).
|
| 281 |
-
3. Soft spam + NO URLs at all + ham patterns detected → also GREEN
|
| 282 |
-
CHANNEL (legit transactional or brand notification).
|
| 283 |
-
|
| 284 |
-
Returns: (label, risk, final_prob, green_cleared, green_reason)
|
| 285 |
-
"""
|
| 286 |
-
# Never green-channel if hard scam patterns are present
|
| 287 |
-
if spam_rule >= 0.55:
|
| 288 |
-
risk = "high" if final_prob >= 0.75 else "medium" if final_prob >= self.threshold else "low"
|
| 289 |
-
return label, risk, final_prob, False, ""
|
| 290 |
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
risk = "low"
|
| 294 |
-
return label, risk, final_prob, False, ""
|
| 295 |
-
|
| 296 |
-
# Build static legit domain set (fallback whitelist)
|
| 297 |
-
static_legit = set()
|
| 298 |
-
try:
|
| 299 |
-
from utils.safe_browsing import get_checker
|
| 300 |
-
static_legit |= get_checker().fallback_legit_domains
|
| 301 |
-
except Exception:
|
| 302 |
-
pass
|
| 303 |
-
try:
|
| 304 |
-
from utils.data_loader import LEGIT_DOMAINS
|
| 305 |
-
static_legit |= LEGIT_DOMAINS
|
| 306 |
-
except Exception:
|
| 307 |
-
pass
|
| 308 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
has_url = bool(uf.get("has_url"))
|
| 310 |
-
|
| 311 |
|
| 312 |
-
#
|
| 313 |
-
if has_url
|
| 314 |
try:
|
| 315 |
import tldextract
|
| 316 |
urls = extract_urls(message)
|
| 317 |
if urls:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
checker = None
|
| 319 |
try:
|
| 320 |
from utils.safe_browsing import get_checker as _gc
|
|
@@ -322,15 +304,14 @@ class SmishingPredictor:
|
|
| 322 |
except Exception:
|
| 323 |
pass
|
| 324 |
|
| 325 |
-
|
| 326 |
-
malicious_found = False
|
| 327 |
for url in urls:
|
| 328 |
ext = tldextract.extract(url)
|
| 329 |
full_domain = f"{ext.domain}.{ext.suffix}".lower().strip(".")
|
| 330 |
if not full_domain:
|
| 331 |
continue
|
| 332 |
|
| 333 |
-
# GSB first
|
| 334 |
status = "unknown"
|
| 335 |
if checker:
|
| 336 |
try:
|
|
@@ -339,36 +320,33 @@ class SmishingPredictor:
|
|
| 339 |
pass
|
| 340 |
|
| 341 |
if status == "known_malicious":
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
|
|
|
| 355 |
return "safe", "low", min(final_prob, 0.40), True, \
|
| 356 |
"all URLs verified as legitimate domains"
|
| 357 |
except Exception:
|
| 358 |
pass
|
| 359 |
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
|
|
|
|
|
|
| 363 |
return "safe", "low", min(final_prob, 0.35), True, \
|
| 364 |
"no URLs + legitimate transactional patterns detected"
|
| 365 |
|
| 366 |
-
# Case 3: Has legit domain URL + ham patterns dominate spam patterns
|
| 367 |
-
if has_url and uf.get("has_legit_domain") and ham_rule > spam_rule:
|
| 368 |
-
return "safe", "low", min(final_prob, 0.42), True, \
|
| 369 |
-
"legit domain URL + stronger ham signals than scam signals"
|
| 370 |
-
|
| 371 |
-
risk = "high" if final_prob >= 0.75 else "medium" if final_prob >= self.threshold else "low"
|
| 372 |
return label, risk, final_prob, False, ""
|
| 373 |
|
| 374 |
def predict(self, message: str) -> dict:
|
|
|
|
| 265 |
spam_rule: float, ham_rule: float,
|
| 266 |
uf: dict) -> Tuple[str, str, float, bool, str]:
|
| 267 |
"""
|
| 268 |
+
Unified URL-aware channel checking.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
|
| 270 |
+
For EVERY message that has URLs, check ALL of them against
|
| 271 |
+
GSB first, then static whitelist:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
|
| 273 |
+
1. ANY URL → known_malicious => SPAM (escalate)
|
| 274 |
+
2. ALL URLs → known_safe => SAFE (green channel)
|
| 275 |
+
3. ANY URL → unverified => keep model verdict (no override)
|
| 276 |
+
4. No URLs + ham patterns => SAFE (promo/ad without links)
|
| 277 |
+
"""
|
| 278 |
has_url = bool(uf.get("has_url"))
|
| 279 |
+
risk = "high" if final_prob >= 0.75 else "medium" if final_prob >= self.threshold else "low"
|
| 280 |
|
| 281 |
+
# ── URL checking (runs for ALL messages with URLs) ──────────────────
|
| 282 |
+
if has_url:
|
| 283 |
try:
|
| 284 |
import tldextract
|
| 285 |
urls = extract_urls(message)
|
| 286 |
if urls:
|
| 287 |
+
# static whitelist
|
| 288 |
+
static_legit = set()
|
| 289 |
+
try:
|
| 290 |
+
from utils.safe_browsing import get_checker
|
| 291 |
+
static_legit |= get_checker().fallback_legit_domains
|
| 292 |
+
except Exception:
|
| 293 |
+
pass
|
| 294 |
+
try:
|
| 295 |
+
from utils.data_loader import LEGIT_DOMAINS
|
| 296 |
+
static_legit |= LEGIT_DOMAINS
|
| 297 |
+
except Exception:
|
| 298 |
+
pass
|
| 299 |
+
|
| 300 |
checker = None
|
| 301 |
try:
|
| 302 |
from utils.safe_browsing import get_checker as _gc
|
|
|
|
| 304 |
except Exception:
|
| 305 |
pass
|
| 306 |
|
| 307 |
+
all_safe = True
|
|
|
|
| 308 |
for url in urls:
|
| 309 |
ext = tldextract.extract(url)
|
| 310 |
full_domain = f"{ext.domain}.{ext.suffix}".lower().strip(".")
|
| 311 |
if not full_domain:
|
| 312 |
continue
|
| 313 |
|
| 314 |
+
# 1. GSB first
|
| 315 |
status = "unknown"
|
| 316 |
if checker:
|
| 317 |
try:
|
|
|
|
| 320 |
pass
|
| 321 |
|
| 322 |
if status == "known_malicious":
|
| 323 |
+
return "spam", "high", max(final_prob, 0.90), False, ""
|
| 324 |
+
if status == "known_safe":
|
| 325 |
+
continue
|
| 326 |
+
if full_domain in static_legit:
|
| 327 |
+
continue
|
| 328 |
+
# unverified → don't green-channel
|
| 329 |
+
all_safe = False
|
| 330 |
+
break
|
| 331 |
+
|
| 332 |
+
if all_safe:
|
| 333 |
+
if label == "spam" and spam_rule >= 0.35:
|
| 334 |
+
# message looks scammy but all URLs are legit → cap at medium
|
| 335 |
+
return "spam", "medium", min(final_prob, 0.55), True, \
|
| 336 |
+
"URLs verified legitimate but message content is suspicious"
|
| 337 |
return "safe", "low", min(final_prob, 0.40), True, \
|
| 338 |
"all URLs verified as legitimate domains"
|
| 339 |
except Exception:
|
| 340 |
pass
|
| 341 |
|
| 342 |
+
# URLs exist but at least one is unverified → keep model verdict
|
| 343 |
+
return label, risk, final_prob, False, ""
|
| 344 |
+
|
| 345 |
+
# ── No URLs: check for harmless promo/ad patterns ───────────────────
|
| 346 |
+
if ham_rule >= 0.35 and spam_rule < 0.35 and label == "spam":
|
| 347 |
return "safe", "low", min(final_prob, 0.35), True, \
|
| 348 |
"no URLs + legitimate transactional patterns detected"
|
| 349 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
return label, risk, final_prob, False, ""
|
| 351 |
|
| 352 |
def predict(self, message: str) -> dict:
|