Spaces:

Perth0603
/

Random-Forest-Model-for-PhishingDetection

Sleeping

App Files Community

Perth0603 committed on Nov 8, 2025

Commit

2c14a34

verified ·

1 Parent(s): d8f11da

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -56

app.py CHANGED Viewed

@@ -140,10 +140,6 @@ def _read_hosts_from_csv(path: str) -> Dict[str, str]:
 def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame:
-    """
-    MODULE 4: URL Analyzer - Feature Engineering
-    Analyzes URL construction, domain composition, and critical components
-    """
     s = pd.Series(urls, dtype=str)
     out = pd.DataFrame()
@@ -185,7 +181,7 @@ def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame
     }
     out["tld_suspicious"] = tld_series.apply(lambda t: 1 if t.lower() in suspicious_tlds else 0)
-    # Punycode indicator (internationalized domain names - often used in homoglyph attacks)
     out["has_punycode"] = hosts.str.contains("xn--").astype(int)
     # SLD stats
@@ -252,10 +248,11 @@ def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame
     out["like_facebook"] = hosts.apply(lambda h: _like_brand(h, "facebook"))
     # Lookalike/homoglyph detection: unusual Unicode symbols that resemble ASCII letters
     def _detect_lookalike_chars(url: str) -> int:
         """
         Detects if URL contains Unicode characters that visually resemble ASCII letters.
-        Common lookalikes used in phishing homoglyph attacks:
         - Cyrillic: а, е, о, р, с, х, у, ч, ы, ь (look like a,e,o,p,c,x,y,4,b,b)
         - Greek: α, ο (look like a, o)
         - Latin Extended: ɑ, ɢ, ᴅ, ɡ, ɪ, ɴ, ɪ (look like a,G,D,g,i,N,I)
@@ -319,20 +316,11 @@ def _normalize_url_string(url: str) -> str:
 @app.get("/")
 def root():
-    return {
-        "status": "ok",
-        "service": "PhishWatch Pro - Module 4: URL Analyzer",
-        "backend": "Random Forest (GPU accelerated)"
-    }
 @app.post("/predict-url")
 def predict_url(payload: PredictUrlPayload):
-    """
-    MODULE 4: URL Analyzer
-    Analyzes URL construction, domain composition, and critical components
-    Returns phishing risk score with confidence level and threat type
-    """
     try:
         _load_url_model()
@@ -374,8 +362,7 @@ def predict_url(payload: PredictUrlPayload):
                 "backend": str(model_type),
                 "threshold": 0.5,
                 "url_col": url_col,
-                "override": {"reason": "csv_url_match", "module": "4_url_analyzer"},
-                "threat_type": "known_phishing_url" if label == "PHISH" else "known_safe",
             }
         # Known-host override (suffix match)
@@ -396,22 +383,23 @@ def predict_url(payload: PredictUrlPayload):
                         "backend": str(model_type),
                         "threshold": 0.5,
                         "url_col": url_col,
-                        "override": {"reason": "known_host_match", "module": "4_url_analyzer"},
-                        "threat_type": "known_phishing_domain" if label == "PHISH" else "known_safe",
                     }
-        # Lookalike character guard: detect homoglyph/lookalike attacks (heuristic indicator)
         try:
             lookalikes_cyrillic = {
                 'а': 'a', 'е': 'e', 'о': 'o', 'р': 'p', 'с': 'c', 'х': 'x',
                 'у': 'y', 'ч': '4', 'ы': 'b', 'ь': 'b', 'і': 'i', 'ї': 'yi',
                 'ґ': 'g', 'ė': 'e', 'ń': 'n', 'ș': 's', 'ț': 't'
             }
             lookalikes_greek = {
                 'α': 'a', 'ο': 'o', 'ν': 'v', 'τ': 't', 'ρ': 'p'
             }
             lookalikes_latin = {
                 'ɑ': 'a', 'ɢ': 'g', 'ᴅ': 'd', 'ɡ': 'g', 'ɪ': 'i',
                 'ɴ': 'n', 'ᴘ': 'p', 'ᴠ': 'v', 'ᴡ': 'w', 'ɨ': 'i'
@@ -431,17 +419,15 @@ def predict_url(payload: PredictUrlPayload):
                         "predicted_label": int(predicted_label),
                         "score": float(score),
                         "phishing_probability": float(phish_proba),
-                        "backend": "homoglyph_guard",
                         "threshold": 0.5,
                         "url_col": url_col,
-                        "rule": "homoglyph_character_detected",
-                        "threat_type": "homoglyph_attack",
-                        "module": "4_url_analyzer_heuristic",
                     }
         except Exception:
             pass
-        # Typosquat guard: detect brand impersonation with typos (heuristic indicator)
         try:
             s_host = (urlparse(_ensure_scheme(url_str)).hostname or "").lower()
             s_sld = s_host.split(".")[-2] if "." in s_host else s_host
@@ -480,14 +466,12 @@ def predict_url(payload: PredictUrlPayload):
                         "backend": "typosquat_guard",
                         "threshold": 0.5,
                         "url_col": url_col,
-                        "rule": "typosquat_detected",
-                        "threat_type": "brand_impersonation",
-                        "module": "4_url_analyzer_heuristic",
                     }
         except Exception:
             pass
-        # Random Forest Model Inference (primary detection)
         feats = _engineer_features([url_str], feature_cols)
         if model_type == "xgboost_bst":
             if xgb is None:
@@ -508,22 +492,6 @@ def predict_url(payload: PredictUrlPayload):
         predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
         score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
-        # Determine threat type based on features
-        threat_type = "unknown"
-        if label == "PHISH":
-            if feats["has_ip"].iloc[0] == 1:
-                threat_type = "ip_based_phishing"
-            elif feats["has_lookalike_chars"].iloc[0] == 1:
-                threat_type = "homoglyph_phishing"
-            elif feats["subdomain_count"].iloc[0] > 3:
-                threat_type = "subdomain_abuse"
-            elif feats["tld_suspicious"].iloc[0] == 1:
-                threat_type = "suspicious_tld"
-            elif any(feats[f"has_{tok}"].iloc[0] == 1 for tok in ["login", "verify", "secure", "bank", "pay"]):
-                threat_type = "phishing_lure"
-            else:
-                threat_type = "anomalous_url_structure"
         return {
             "label": label,
             "predicted_label": int(predicted_label),
@@ -532,15 +500,8 @@ def predict_url(payload: PredictUrlPayload):
             "backend": str(model_type),
             "threshold": 0.5,
             "url_col": url_col,
-            "threat_type": threat_type,
-            "module": "4_url_analyzer_random_forest",
-            "features": {
-                "url_length": float(feats["url_len"].iloc[0]),
-                "subdomain_count": float(feats["subdomain_count"].iloc[0]),
-                "has_ip": bool(feats["has_ip"].iloc[0]),
-                "suspicious_tld": bool(feats["tld_suspicious"].iloc[0]),
-                "has_punycode": bool(feats["has_punycode"].iloc[0]),
-            }
         }
     except Exception as e:
-        return JSONResponse(status_code=500, content={"error": str(e)})

 def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame:
     s = pd.Series(urls, dtype=str)
     out = pd.DataFrame()
     }
     out["tld_suspicious"] = tld_series.apply(lambda t: 1 if t.lower() in suspicious_tlds else 0)
+    # Punycode indicator
     out["has_punycode"] = hosts.str.contains("xn--").astype(int)
     # SLD stats
     out["like_facebook"] = hosts.apply(lambda h: _like_brand(h, "facebook"))
     # Lookalike/homoglyph detection: unusual Unicode symbols that resemble ASCII letters
+    # Examples: Cyrillic а (U+0430) looks like 'a', Greek α (U+03B1) looks like 'a', etc.
     def _detect_lookalike_chars(url: str) -> int:
         """
         Detects if URL contains Unicode characters that visually resemble ASCII letters.
+        Common lookalikes used in phishing:
         - Cyrillic: а, е, о, р, с, х, у, ч, ы, ь (look like a,e,o,p,c,x,y,4,b,b)
         - Greek: α, ο (look like a, o)
         - Latin Extended: ɑ, ɢ, ᴅ, ɡ, ɪ, ɴ, ɪ (look like a,G,D,g,i,N,I)
 @app.get("/")
 def root():
+    return {"status": "ok", "backend": "url-only"}
 @app.post("/predict-url")
 def predict_url(payload: PredictUrlPayload):
     try:
         _load_url_model()
                 "backend": str(model_type),
                 "threshold": 0.5,
                 "url_col": url_col,
+                "override": {"reason": "csv_url_match"},
             }
         # Known-host override (suffix match)
                         "backend": str(model_type),
                         "threshold": 0.5,
                         "url_col": url_col,
                     }
+        # Lookalike character guard: detect homoglyph/lookalike attacks
         try:
+            # Cyrillic characters that look like ASCII letters
             lookalikes_cyrillic = {
                 'а': 'a', 'е': 'e', 'о': 'o', 'р': 'p', 'с': 'c', 'х': 'x',
                 'у': 'y', 'ч': '4', 'ы': 'b', 'ь': 'b', 'і': 'i', 'ї': 'yi',
                 'ґ': 'g', 'ė': 'e', 'ń': 'n', 'ș': 's', 'ț': 't'
             }
+            # Greek characters that look like ASCII letters
             lookalikes_greek = {
                 'α': 'a', 'ο': 'o', 'ν': 'v', 'τ': 't', 'ρ': 'p'
             }
+            # Latin Extended lookalikes
             lookalikes_latin = {
                 'ɑ': 'a', 'ɢ': 'g', 'ᴅ': 'd', 'ɡ': 'g', 'ɪ': 'i',
                 'ɴ': 'n', 'ᴘ': 'p', 'ᴠ': 'v', 'ᴡ': 'w', 'ɨ': 'i'
                         "predicted_label": int(predicted_label),
                         "score": float(score),
                         "phishing_probability": float(phish_proba),
+                        "backend": "lookalike_guard",
                         "threshold": 0.5,
                         "url_col": url_col,
+                        "rule": "lookalike_character_detected",
                     }
         except Exception:
             pass
+        # Typosquat guard: mirror notebook fallback logic.
         try:
             s_host = (urlparse(_ensure_scheme(url_str)).hostname or "").lower()
             s_sld = s_host.split(".")[-2] if "." in s_host else s_host
                         "backend": "typosquat_guard",
                         "threshold": 0.5,
                         "url_col": url_col,
+                        "rule": "typosquat_guard",
                     }
         except Exception:
             pass
+        # Mirror inference flow for probability of class 1
         feats = _engineer_features([url_str], feature_cols)
         if model_type == "xgboost_bst":
             if xgb is None:
         predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
         score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
         return {
             "label": label,
             "predicted_label": int(predicted_label),
             "backend": str(model_type),
             "threshold": 0.5,
             "url_col": url_col,
         }
     except Exception as e:
+        return JSONResponse(status_code=500, content={"error": str(e)})