Spaces:

Perth0603
/

Random-Forest-Model-for-PhishingDetection

Sleeping

App Files Files Community

Perth0603 committed on Oct 4, 2025

Commit

3c180d3

verified ·

1 Parent(s): 639276e

Upload app.py

Browse files

Files changed (1) hide show

app.py +45 -2

app.py CHANGED Viewed

@@ -213,7 +213,7 @@ def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame
     common_brands = [
         "facebook", "google", "youtube", "apple", "microsoft",
         "paypal", "amazon", "netflix", "instagram", "whatsapp",
-        "tiktok", "twitter", "telegram", "bank", "login"
     ]
     def _max_brand_similarity(host: str) -> float:
@@ -346,7 +346,50 @@ def predict_url(payload: PredictUrlPayload):
                         "url_col": url_col,
                     }
-        # Mirror inference.py exactly for probability of class 1
         feats = _engineer_features([url_str], feature_cols)
         if model_type == "xgboost_bst":
             if xgb is None:

     common_brands = [
         "facebook", "google", "youtube", "apple", "microsoft",
         "paypal", "amazon", "netflix", "instagram", "whatsapp",
+        "tiktok", "twitter", "telegram", "linkedin", "bank", "login"
     ]
     def _max_brand_similarity(host: str) -> float:
                         "url_col": url_col,
                     }
+        # Typosquat guard: if SLD is very similar to common brands but not exact,
+        # short-circuit as phishing with high confidence to match notebook demos.
+        try:
+            s_host = (urlparse(_ensure_scheme(url_str)).hostname or "").lower()
+            s_sld = s_host.split(".")[-2] if "." in s_host else s_host
+            def _normalize_brand(s: str) -> str:
+                return re.sub(r"[^a-z]", "", s.lower())
+            s_clean = _normalize_brand(s_sld)
+            brands = [
+                "facebook","linkedin","paypal","google","amazon","apple",
+                "microsoft","instagram","netflix","twitter","whatsapp"
+            ]
+            def _sim(a: str, b: str) -> float:
+                try:
+                    from rapidfuzz import fuzz  # type: ignore
+                    return float(fuzz.ratio(a, b)) / 100.0
+                except Exception:
+                    from difflib import SequenceMatcher
+                    return SequenceMatcher(None, a, b).ratio()
+            is_exact = any(s_clean == _normalize_brand(b) for b in brands)
+            if s_clean and not is_exact:
+                best = 0.0
+                for b in brands:
+                    best = max(best, _sim(s_clean, _normalize_brand(b)))
+                if best >= 0.82:
+                    phish_is_positive = True if URL_POSITIVE_CLASS_ENV == "" else (URL_POSITIVE_CLASS_ENV == "PHISH")
+                    label = "PHISH"
+                    predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
+                    phish_proba = 0.90
+                    score = phish_proba
+                    return {
+                        "label": label,
+                        "predicted_label": int(predicted_label),
+                        "score": float(score),
+                        "phishing_probability": float(phish_proba),
+                        "backend": "typosquat_guard",
+                        "threshold": 0.5,
+                        "url_col": url_col,
+                        "rule": "typosquat_guard",
+                    }
+        except Exception:
+            pass
+        # Mirror inference flow for probability of class 1
         feats = _engineer_features([url_str], feature_cols)
         if model_type == "xgboost_bst":
             if xgb is None: