Spaces:

ExistedYear
/

smishing_detector_api

Paused

App Files Community

ExistedYear committed about 23 hours ago

Commit

348b3a4

1 Parent(s): 2df84b2

GSB priority in green channel

Browse files

Files changed (2) hide show

smishing_detector/predictor.py +35 -6
smishing_detector/utils/data_loader.py +1 -2

smishing_detector/predictor.py CHANGED Viewed

@@ -293,35 +293,64 @@ class SmishingPredictor:
             risk = "low"
             return label, risk, final_prob, False, ""
-        # Build unified legit domain set (safe_browsing + data_loader)
-        legit_domains = set()
         try:
             from utils.safe_browsing import get_checker
-            legit_domains |= get_checker().fallback_legit_domains
         except Exception:
             pass
         try:
             from utils.data_loader import LEGIT_DOMAINS
-            legit_domains |= LEGIT_DOMAINS
         except Exception:
             pass
         has_url = bool(uf.get("has_url"))
         has_suspicious = bool(uf.get("suspicious_tld") or uf.get("has_ip_url"))
-        # Case 1: Message has URLs — check if ALL are legit
         if has_url and not has_suspicious:
             try:
                 import tldextract
                 urls = extract_urls(message)
                 if urls:
                     all_legit = True
                     for url in urls:
                         ext = tldextract.extract(url)
                         full_domain = f"{ext.domain}.{ext.suffix}".lower().strip(".")
-                        if full_domain not in legit_domains:
                             all_legit = False
                             break
                     if all_legit:
                         return "safe", "low", min(final_prob, 0.40), True, \
                             "all URLs verified as legitimate domains"

             risk = "low"
             return label, risk, final_prob, False, ""
+        # Build static legit domain set (fallback whitelist)
+        static_legit = set()
         try:
             from utils.safe_browsing import get_checker
+            static_legit |= get_checker().fallback_legit_domains
         except Exception:
             pass
         try:
             from utils.data_loader import LEGIT_DOMAINS
+            static_legit |= LEGIT_DOMAINS
         except Exception:
             pass
         has_url = bool(uf.get("has_url"))
         has_suspicious = bool(uf.get("suspicious_tld") or uf.get("has_ip_url"))
+        # Case 1: Message has URLs — check each via GSB then static whitelist
         if has_url and not has_suspicious:
             try:
                 import tldextract
                 urls = extract_urls(message)
                 if urls:
+                    checker = None
+                    try:
+                        from utils.safe_browsing import get_checker as _gc
+                        checker = _gc()
+                    except Exception:
+                        pass
                     all_legit = True
+                    malicious_found = False
                     for url in urls:
                         ext = tldextract.extract(url)
                         full_domain = f"{ext.domain}.{ext.suffix}".lower().strip(".")
+                        if not full_domain:
+                            continue
+                        # GSB first (cached, live lookup if needed)
+                        status = "unknown"
+                        if checker:
+                            try:
+                                status = checker.check_domain_status(full_domain)
+                            except Exception:
+                                pass
+                        if status == "known_malicious":
+                            malicious_found = True
+                            break
+                        elif status == "known_safe":
+                            continue  # GSB verified safe
+                        elif full_domain in static_legit:
+                            continue  # static whitelist
+                        else:
                             all_legit = False
                             break
+                    if malicious_found:
+                        return "spam", "high", max(final_prob, 0.90), False, ""
                     if all_legit:
                         return "safe", "low", min(final_prob, 0.40), True, \
                             "all URLs verified as legitimate domains"

smishing_detector/utils/data_loader.py CHANGED Viewed

@@ -53,8 +53,7 @@ LEGIT_DOMAINS = {
 }
 TRUSTED_TLDS = {
-    "com", "org", "net", "edu", "gov", "gov.in", "co.uk", "au",
-    "ca", "de", "fr", "in", "co.in", "sbi",
 }
 URL_FEATURE_COLS = [

 }
 TRUSTED_TLDS = {
+    "gov", "gov.in", "edu", "sbi",
 }
 URL_FEATURE_COLS = [