Spaces:

Perth0603
/

Random-Forest-Model-for-PhishingDetection

Sleeping

App Files Files Community

Perth0603 committed on Oct 4, 2025

Commit

016c0e8

verified ·

1 Parent(s): e2e3793

Upload 4 files

Browse files

Files changed (1) hide show

app.py +102 -0

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ os.environ.setdefault("TRANSFORMERS_CACHE", "/data/.cache")
 os.environ.setdefault("TORCH_HOME", "/data/.cache")
 from typing import Optional, List, Dict, Any
 from urllib.parse import urlparse
 import threading
 import re
@@ -131,6 +132,103 @@ _KNOWN_LEGIT_HOSTS: List[str] = [
 _KNOWN_PHISH_HOSTS: List[str] = [
 ]
 # -------------------------
 # URL features (must match training)
 # -------------------------
@@ -294,6 +392,8 @@ def _startup():
         print(f"[startup] text model load failed: {e}")
     try:
         _load_url_model()
         global _url_phish_is_positive
         b = _url_bundle
         if isinstance(b, dict) and _url_phish_is_positive is None:
@@ -343,6 +443,8 @@ def predict(payload: PredictPayload):
 def predict_url(payload: PredictUrlPayload):
     try:
         _load_url_model()
         bundle = _url_bundle
         if not isinstance(bundle, dict) or "model" not in bundle:
             raise RuntimeError("Loaded URL artifact is not a bundle dict with 'model'.")

 os.environ.setdefault("TORCH_HOME", "/data/.cache")
 from typing import Optional, List, Dict, Any
+import csv
 from urllib.parse import urlparse
 import threading
 import re
 _KNOWN_PHISH_HOSTS: List[str] = [
 ]
+# Helpers to normalize and match hosts by suffix (handles subdomains)
def _normalize_host(value: str) -> str:
    """Return *value* in a canonical form suitable for host comparison.

    The text is stripped of surrounding whitespace, lower-cased, relieved of
    any trailing dot, and finally of one leading ``www.`` label.

    Args:
        value: Host name as written in a URL or a configuration list.

    Returns:
        The normalised host name (possibly an empty string).
    """
    v = value.strip().lower()
    # "example.com." is the absolute DNS spelling of "example.com". Leaving the
    # dot in place would let such a host slip past every known-host comparison.
    v = v.rstrip(".")
    if v.startswith("www."):
        v = v[4:]
    return v
def _host_matches_any(host: str, known: List[str]) -> bool:
    """Tell whether *host* equals, or is a subdomain of, any entry in *known*.

    Both *host* and every entry are passed through ``_normalize_host`` before
    being compared. A subdomain match requires a ``.`` boundary, so
    ``evil-example.com`` does not match ``example.com`` while
    ``login.example.com`` does.

    Args:
        host: Host name to test.
        known: Host names to test against.

    Returns:
        True on the first match; False when nothing matches, when *host*
        normalises to an empty string, or when *known* is empty.
    """
    base = _normalize_host(host)
    if not base:
        # An empty host carries no information and must never be treated as a
        # match for anything.
        return False
    for item in known:
        k = _normalize_host(item)
        if not k:
            # A blank entry would turn the suffix test into endswith("."),
            # which has nothing to do with the entry itself.
            continue
        if base == k or base.endswith("." + k):
            return True
    return False
+# Optional CSV configuration
def _read_urls_from_csv(path: str) -> List[str]:
    """Read a list of URLs from the CSV file at *path*.

    Two layouts are accepted:

    * The first row is a header containing a ``url`` column (compared
      case-insensitively, ignoring surrounding whitespace): that column is
      read from every following row.
    * Otherwise the first column of every row is read, skipping any cell
      whose text is literally ``url`` in any letter case.

    Values are stripped and blank ones are dropped.

    Args:
        path: Location of the CSV file.

    Returns:
        The URLs in file order. Reading is best-effort: on any failure a
        message is printed and whatever was collected so far is returned.
    """
    urls: List[str] = []
    try:
        # "utf-8-sig" drops a leading byte-order mark (as written by some
        # spreadsheet exports); otherwise the first header would read
        # "\ufeffurl" and never be recognised.
        with open(path, newline="", encoding="utf-8-sig") as f:
            reader = csv.DictReader(f)
            url_key = next(
                (
                    name
                    for name in (reader.fieldnames or [])
                    if name.strip().lower() == "url"
                ),
                None,
            )
            if url_key is not None:
                for row in reader:
                    # DictReader pads short rows with None; "or ''" keeps that
                    # from becoming the literal string "None".
                    val = (row.get(url_key) or "").strip()
                    if val:
                        urls.append(val)
            else:
                # No usable header: rewind and treat column 0 as the URL.
                f.seek(0)
                for row in csv.reader(f):
                    if not row:
                        continue
                    val = row[0].strip()
                    if val and val.lower() != "url":
                        urls.append(val)
    except Exception as e:
        print(f"[csv] failed reading URLs from {path}: {e}")
    return urls
def _read_hosts_from_csv(path: str) -> Dict[str, str]:
    """Read a host -> label mapping from the CSV file at *path*.

    Two layouts are accepted:

    * The first row is a header containing ``host`` and ``label`` columns
      (compared case-insensitively, ignoring surrounding whitespace): those
      columns are read from every following row.
    * Otherwise column 0 is taken as the host and column 1 as the label.

    Hosts are stripped and lower-cased, labels stripped and upper-cased. Only
    rows with a non-empty host and a label of ``PHISH`` or ``LEGIT`` are
    kept; a later row for the same host overrides an earlier one.

    Args:
        path: Location of the CSV file.

    Returns:
        Mapping of host to ``"PHISH"`` or ``"LEGIT"``. Reading is
        best-effort: on any failure a message is printed and whatever was
        collected so far is returned.
    """
    valid_labels = ("PHISH", "LEGIT")
    host_to_label: Dict[str, str] = {}
    try:
        # "utf-8-sig" drops a leading byte-order mark so the first header is
        # seen as "host" rather than "\ufeffhost".
        with open(path, newline="", encoding="utf-8-sig") as f:
            reader = csv.DictReader(f)
            # Normalised header -> header exactly as written in the file. Rows
            # are keyed by the latter, so lookups must use it; looking up the
            # lowercase name directly finds nothing for a "Host,Label" header.
            columns = {
                name.strip().lower(): name
                for name in (reader.fieldnames or [])
            }
            if "host" in columns and "label" in columns:
                host_key = columns["host"]
                label_key = columns["label"]
                for row in reader:
                    # DictReader pads short rows with None; "or ''" keeps that
                    # from becoming the literal text "none"/"NONE".
                    host = (row.get(host_key) or "").strip().lower()
                    label = (row.get(label_key) or "").strip().upper()
                    if host and label in valid_labels:
                        host_to_label[host] = label
            else:
                # No usable header: rewind and read positionally.
                f.seek(0)
                for row in csv.reader(f):
                    if len(row) < 2:
                        continue
                    host = row[0].strip().lower()
                    label = row[1].strip().upper()
                    # A stray "host,label" header row is rejected here too,
                    # because "LABEL" is not a valid label.
                    if host and label in valid_labels:
                        host_to_label[host] = label
    except Exception as e:
        print(f"[csv] failed reading hosts from {path}: {e}")
    return host_to_label
def _load_csv_configs_if_any() -> None:
    """Refresh the calibration URL lists and known-host lists from CSV files.

    Each file location comes from an environment variable, falling back to a
    file next to this module:

    * ``AUTOCALIB_PHISHY_CSV`` (default ``autocalib_phishy.csv``)
      -> ``_AUTOCALIB_PHISHY_URLS``
    * ``AUTOCALIB_LEGIT_CSV`` (default ``autocalib_legit.csv``)
      -> ``_AUTOCALIB_LEGIT_URLS``
    * ``KNOWN_HOSTS_CSV`` (default ``known_hosts.csv``)
      -> ``_KNOWN_LEGIT_HOSTS`` and ``_KNOWN_PHISH_HOSTS``

    A target list is only replaced when its file exists and yields at least
    one entry; otherwise the current contents are left untouched. The lists
    are updated in place, so other references to them see the new contents.
    """
    base_dir = os.path.dirname(__file__)
    phishy_csv = os.environ.get("AUTOCALIB_PHISHY_CSV", os.path.join(base_dir, "autocalib_phishy.csv"))
    legit_csv = os.environ.get("AUTOCALIB_LEGIT_CSV", os.path.join(base_dir, "autocalib_legit.csv"))
    hosts_csv = os.environ.get("KNOWN_HOSTS_CSV", os.path.join(base_dir, "known_hosts.csv"))

    if os.path.exists(phishy_csv):
        urls = _read_urls_from_csv(phishy_csv)
        if urls:
            print(f"[csv] loaded phishy URLs: {len(urls)} from {phishy_csv}")
            _AUTOCALIB_PHISHY_URLS[:] = urls

    if os.path.exists(legit_csv):
        urls = _read_urls_from_csv(legit_csv)
        if urls:
            print(f"[csv] loaded legit URLs: {len(urls)} from {legit_csv}")
            _AUTOCALIB_LEGIT_URLS[:] = urls

    if os.path.exists(hosts_csv):
        mapping = _read_hosts_from_csv(hosts_csv)
        if mapping:
            print(f"[csv] loaded known hosts: {len(mapping)} from {hosts_csv}")
            # Build the replacements first, then swap each list with one slice
            # assignment (as the URL lists above do). This function runs on
            # every predict_url call, and clear() followed by append() would
            # leave the lists empty or half-filled in between.
            legit_hosts = [host for host, label in mapping.items() if label == "LEGIT"]
            phish_hosts = [host for host, label in mapping.items() if label == "PHISH"]
            _KNOWN_LEGIT_HOSTS[:] = legit_hosts
            _KNOWN_PHISH_HOSTS[:] = phish_hosts
 # -------------------------
 # URL features (must match training)
 # -------------------------
         print(f"[startup] text model load failed: {e}")
     try:
         _load_url_model()
+        # Load CSV-driven config if present
+        _load_csv_configs_if_any()
         global _url_phish_is_positive
         b = _url_bundle
         if isinstance(b, dict) and _url_phish_is_positive is None:
 def predict_url(payload: PredictUrlPayload):
     try:
         _load_url_model()
+        # Load CSV-based config if present (hot-reload safe)
+        _load_csv_configs_if_any()
         bundle = _url_bundle
         if not isinstance(bundle, dict) or "model" not in bundle:
             raise RuntimeError("Loaded URL artifact is not a bundle dict with 'model'.")