Perth0603 committed on
Commit
20cb166
·
verified ·
1 Parent(s): b72253c

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -1
app.py CHANGED
@@ -3,6 +3,7 @@ import csv
3
  import re
4
  import threading
5
  from typing import Optional, List, Dict, Any
 
6
 
7
  import joblib
8
  import numpy as np
@@ -124,6 +125,8 @@ def _read_hosts_from_csv(path: str) -> Dict[str, str]:
124
  def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame:
125
  s = pd.Series(urls, dtype=str)
126
  out = pd.DataFrame()
 
 
127
  out["url_len"] = s.str.len().fillna(0)
128
  out["count_dot"] = s.str.count(r"\.")
129
  out["count_hyphen"] = s.str.count("-")
@@ -139,7 +142,98 @@ def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame
139
  out["starts_https"] = s.str.startswith("https").astype(int)
140
  out["ends_with_exe"] = s.str.endswith(".exe").astype(int)
141
  out["ends_with_zip"] = s.str.endswith(".zip").astype(int)
142
- return out[feature_cols]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
 
145
  def _load_url_model():
 
3
  import re
4
  import threading
5
  from typing import Optional, List, Dict, Any
6
+ from difflib import SequenceMatcher
7
 
8
  import joblib
9
  import numpy as np
 
125
  def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame:
126
  s = pd.Series(urls, dtype=str)
127
  out = pd.DataFrame()
128
+
129
+ # Base URL-wide counts used by older models
130
  out["url_len"] = s.str.len().fillna(0)
131
  out["count_dot"] = s.str.count(r"\.")
132
  out["count_hyphen"] = s.str.count("-")
 
142
  out["starts_https"] = s.str.startswith("https").astype(int)
143
  out["ends_with_exe"] = s.str.endswith(".exe").astype(int)
144
  out["ends_with_zip"] = s.str.endswith(".zip").astype(int)
145
+
146
+ # Host/SLD/TLD derived features used by newer models
147
+ hosts = s.apply(lambda x: (urlparse(x).hostname or "").lower())
148
+ out["host_len"] = hosts.str.len().fillna(0)
149
+
150
+ # Subdomain count: number of labels minus 2 (for sld.tld); never below 0
151
+ label_counts = hosts.str.count(r"\.") + 1
152
+ sub_count = (label_counts - 2).clip(lower=0)
153
+ out["subdomain_count"] = sub_count.fillna(0)
154
+
155
+ # TLD and SLD extraction (simple heuristic; handles common cases)
156
+ parts_series = hosts.str.split(".")
157
+ tld_series = parts_series.apply(lambda p: p[-1] if len(p) >= 1 else "")
158
+ sld_series = parts_series.apply(lambda p: p[-2] if len(p) >= 2 else "")
159
+
160
+ # Suspicious TLD flag (expand as needed)
161
+ suspicious_tlds = {
162
+ "tk", "ml", "ga", "cf", "gq", "xyz", "top", "buzz", "icu",
163
+ "fit", "rest", "work", "click", "country", "zip"
164
+ }
165
+ out["tld_suspicious"] = tld_series.apply(lambda t: 1 if t.lower() in suspicious_tlds else 0)
166
+
167
+ # Punycode indicator
168
+ out["has_punycode"] = hosts.str.contains("xn--").astype(int)
169
+
170
+ # SLD stats
171
+ out["sld_len"] = sld_series.str.len().fillna(0)
172
+ def _ratio_digits(txt: str) -> float:
173
+ txt = txt or ""
174
+ if not txt:
175
+ return 0.0
176
+ digits = sum(c.isdigit() for c in txt)
177
+ return float(digits) / float(len(txt))
178
+ out["sld_digit_ratio"] = sld_series.apply(_ratio_digits)
179
+
180
+ def _shannon_entropy(txt: str) -> float:
181
+ txt = txt or ""
182
+ if not txt:
183
+ return 0.0
184
+ counts: Dict[str, int] = {}
185
+ for ch in txt:
186
+ counts[ch] = counts.get(ch, 0) + 1
187
+ total = float(len(txt))
188
+ entropy = 0.0
189
+ for n in counts.values():
190
+ p = n / total
191
+ entropy -= p * np.log2(p)
192
+ return float(entropy)
193
+ out["sld_entropy"] = sld_series.apply(_shannon_entropy)
194
+
195
+ # Brand similarity features (lightweight; stdlib only)
196
+ common_brands = [
197
+ "facebook", "google", "youtube", "apple", "microsoft",
198
+ "paypal", "amazon", "netflix", "instagram", "whatsapp",
199
+ "tiktok", "twitter", "telegram", "bank", "login"
200
+ ]
201
+
202
+ def _max_brand_similarity(host: str) -> float:
203
+ host = host or ""
204
+ if not host:
205
+ return 0.0
206
+ # Compare against host and sld specifically
207
+ best = 0.0
208
+ sld_local = host.split(".")[-2] if "." in host else host
209
+ for brand in common_brands:
210
+ best = max(
211
+ best,
212
+ SequenceMatcher(None, host, brand).ratio(),
213
+ SequenceMatcher(None, sld_local, brand).ratio(),
214
+ )
215
+ return float(best)
216
+
217
+ def _like_brand(host: str, brand: str, threshold: float = 0.82) -> int:
218
+ h = host or ""
219
+ if not h:
220
+ return 0
221
+ if brand in h:
222
+ return 1
223
+ sld_local = h.split(".")[-2] if "." in h else h
224
+ score = max(
225
+ SequenceMatcher(None, h, brand).ratio(),
226
+ SequenceMatcher(None, sld_local, brand).ratio(),
227
+ )
228
+ return 1 if score >= threshold else 0
229
+
230
+ out["max_brand_sim"] = hosts.apply(_max_brand_similarity)
231
+ out["like_facebook"] = hosts.apply(lambda h: _like_brand(h, "facebook"))
232
+
233
+ # Return columns in the exact order expected by the model; fill any
234
+ # still-missing engineered columns with zeros to stay robust across
235
+ # model updates.
236
+ return out.reindex(columns=feature_cols, fill_value=0)
237
 
238
 
239
  def _load_url_model():