ExistedYear committed on
Commit
abdf9b3
·
1 Parent(s): 348b3a4
Files changed (1) hide show
  1. smishing_detector/predictor.py +45 -67
smishing_detector/predictor.py CHANGED
@@ -265,56 +265,38 @@ class SmishingPredictor:
265
  spam_rule: float, ham_rule: float,
266
  uf: dict) -> Tuple[str, str, float, bool, str]:
267
  """
268
- Green Channel — distinguishes SCAMS from harmless promotional spam.
269
-
270
- This project detects smishing (scam SMS), NOT general spam.
271
- A message like "Congratulations! Free 5G SIM upgrade. Visit your nearest
272
- Airtel store" is promotional spam, not a scam — it has no phishing URL,
273
- no threats, no urgency to click a link.
274
-
275
- Logic:
276
- 1. Hard scam signals (spam_rule >= 0.55) → NEVER clear. These have
277
- explicit phishing patterns like "account suspended", "KYC update",
278
- "digital arrest" with suspicious URLs.
279
- 2. Soft spam (spam_rule < 0.55) + ALL URLs are verified legit domains
280
- → GREEN CHANNEL → downgrade to "safe" (it's just an ad/promo).
281
- 3. Soft spam + NO URLs at all + ham patterns detected → also GREEN
282
- CHANNEL (legit transactional or brand notification).
283
-
284
- Returns: (label, risk, final_prob, green_cleared, green_reason)
285
- """
286
- # Never green-channel if hard scam patterns are present
287
- if spam_rule >= 0.55:
288
- risk = "high" if final_prob >= 0.75 else "medium" if final_prob >= self.threshold else "low"
289
- return label, risk, final_prob, False, ""
290
 
291
- # Only applies to messages that would otherwise be classified as spam
292
- if label != "spam":
293
- risk = "low"
294
- return label, risk, final_prob, False, ""
295
-
296
- # Build static legit domain set (fallback whitelist)
297
- static_legit = set()
298
- try:
299
- from utils.safe_browsing import get_checker
300
- static_legit |= get_checker().fallback_legit_domains
301
- except Exception:
302
- pass
303
- try:
304
- from utils.data_loader import LEGIT_DOMAINS
305
- static_legit |= LEGIT_DOMAINS
306
- except Exception:
307
- pass
308
 
 
 
 
 
 
309
  has_url = bool(uf.get("has_url"))
310
- has_suspicious = bool(uf.get("suspicious_tld") or uf.get("has_ip_url"))
311
 
312
- # Case 1: Message has URLs — check each via GSB then static whitelist
313
- if has_url and not has_suspicious:
314
  try:
315
  import tldextract
316
  urls = extract_urls(message)
317
  if urls:
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  checker = None
319
  try:
320
  from utils.safe_browsing import get_checker as _gc
@@ -322,15 +304,14 @@ class SmishingPredictor:
322
  except Exception:
323
  pass
324
 
325
- all_legit = True
326
- malicious_found = False
327
  for url in urls:
328
  ext = tldextract.extract(url)
329
  full_domain = f"{ext.domain}.{ext.suffix}".lower().strip(".")
330
  if not full_domain:
331
  continue
332
 
333
- # GSB first (cached, live lookup if needed)
334
  status = "unknown"
335
  if checker:
336
  try:
@@ -339,36 +320,33 @@ class SmishingPredictor:
339
  pass
340
 
341
  if status == "known_malicious":
342
- malicious_found = True
343
- break
344
- elif status == "known_safe":
345
- continue # GSB verified safe
346
- elif full_domain in static_legit:
347
- continue # static whitelist
348
- else:
349
- all_legit = False
350
- break
351
-
352
- if malicious_found:
353
- return "spam", "high", max(final_prob, 0.90), False, ""
354
- if all_legit:
 
355
  return "safe", "low", min(final_prob, 0.40), True, \
356
  "all URLs verified as legitimate domains"
357
  except Exception:
358
  pass
359
 
360
- # Case 2: No URL + ham patterns present + low spam rule score
361
- # e.g. "Airtel: Congratulations! Eligible for free 5G upgrade. Visit store."
362
- if not has_url and ham_rule >= 0.35 and spam_rule < 0.35:
 
 
363
  return "safe", "low", min(final_prob, 0.35), True, \
364
  "no URLs + legitimate transactional patterns detected"
365
 
366
- # Case 3: Has legit domain URL + ham patterns dominate spam patterns
367
- if has_url and uf.get("has_legit_domain") and ham_rule > spam_rule:
368
- return "safe", "low", min(final_prob, 0.42), True, \
369
- "legit domain URL + stronger ham signals than scam signals"
370
-
371
- risk = "high" if final_prob >= 0.75 else "medium" if final_prob >= self.threshold else "low"
372
  return label, risk, final_prob, False, ""
373
 
374
  def predict(self, message: str) -> dict:
 
265
  spam_rule: float, ham_rule: float,
266
  uf: dict) -> Tuple[str, str, float, bool, str]:
267
  """
268
+ Unified URL-aware channel checking.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
 
270
+ For EVERY message that has URLs, check ALL of them against
271
+ GSB first, then static whitelist:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
 
273
+ 1. ANY URL → known_malicious => SPAM (escalate)
274
+ 2. ALL URLs → known_safe => SAFE (green channel)
275
+ 3. ANY URL → unverified => keep model verdict (no override)
276
+ 4. No URLs + ham patterns => SAFE (promo/ad without links)
277
+ """
278
  has_url = bool(uf.get("has_url"))
279
+ risk = "high" if final_prob >= 0.75 else "medium" if final_prob >= self.threshold else "low"
280
 
281
+ # ── URL checking (runs for ALL messages with URLs) ──────────────────
282
+ if has_url:
283
  try:
284
  import tldextract
285
  urls = extract_urls(message)
286
  if urls:
287
+ # static whitelist
288
+ static_legit = set()
289
+ try:
290
+ from utils.safe_browsing import get_checker
291
+ static_legit |= get_checker().fallback_legit_domains
292
+ except Exception:
293
+ pass
294
+ try:
295
+ from utils.data_loader import LEGIT_DOMAINS
296
+ static_legit |= LEGIT_DOMAINS
297
+ except Exception:
298
+ pass
299
+
300
  checker = None
301
  try:
302
  from utils.safe_browsing import get_checker as _gc
 
304
  except Exception:
305
  pass
306
 
307
+ all_safe = True
 
308
  for url in urls:
309
  ext = tldextract.extract(url)
310
  full_domain = f"{ext.domain}.{ext.suffix}".lower().strip(".")
311
  if not full_domain:
312
  continue
313
 
314
+ # 1. GSB first
315
  status = "unknown"
316
  if checker:
317
  try:
 
320
  pass
321
 
322
  if status == "known_malicious":
323
+ return "spam", "high", max(final_prob, 0.90), False, ""
324
+ if status == "known_safe":
325
+ continue
326
+ if full_domain in static_legit:
327
+ continue
328
+ # unverified — don't green-channel
329
+ all_safe = False
330
+ break
331
+
332
+ if all_safe:
333
+ if label == "spam" and spam_rule >= 0.35:
334
+ # message looks scammy but all URLs are legit → cap at medium
335
+ return "spam", "medium", min(final_prob, 0.55), True, \
336
+ "URLs verified legitimate but message content is suspicious"
337
  return "safe", "low", min(final_prob, 0.40), True, \
338
  "all URLs verified as legitimate domains"
339
  except Exception:
340
  pass
341
 
342
+ # URLs exist but at least one is unverified → keep model verdict
343
+ return label, risk, final_prob, False, ""
344
+
345
+ # ── No URLs: check for harmless promo/ad patterns ───────────────────
346
+ if ham_rule >= 0.35 and spam_rule < 0.35 and label == "spam":
347
  return "safe", "low", min(final_prob, 0.35), True, \
348
  "no URLs + legitimate transactional patterns detected"
349
 
 
 
 
 
 
 
350
  return label, risk, final_prob, False, ""
351
 
352
  def predict(self, message: str) -> dict: