Spaces:

Perth0603
/

Random-Forest-Model-for-PhishingDetection

Sleeping

App Files Community

Perth0603 committed on Oct 4, 2025

Commit

e2e3793

verified ·

1 Parent(s): cedbf8c

Upload 4 files

Browse files

Files changed (3) hide show

Dockerfile +0 -4
README.md +0 -31
app.py +60 -141

Dockerfile CHANGED Viewed

@@ -20,11 +20,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 COPY requirements.txt /app/requirements.txt
 RUN pip install -r /app/requirements.txt
-# App code and data files
 COPY app.py /app/app.py
-COPY autocalib_phishy.csv /app/autocalib_phishy.csv
-COPY autocalib_legit.csv /app/autocalib_legit.csv
-COPY known_hosts.csv /app/known_hosts.csv
 EXPOSE 7860
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

 COPY requirements.txt /app/requirements.txt
 RUN pip install -r /app/requirements.txt
 COPY app.py /app/app.py
 EXPOSE 7860
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -13,7 +13,6 @@ This Space exposes two endpoints so the Flutter app can call them reliably:
   - `phishing_probability` is always the raw probability of phishing (0..1)
   - `label` is `PHISH` when `phishing_probability >= threshold`, else `LEGIT`
   - `score` is the confidence for the predicted label (for `LEGIT`, `score = 1 - phishing_probability`), which lets the app show "Safe Confidence" for legitimate URLs
-  - Also includes `predicted_label` (0→LEGIT, 1→PHISH) aligned to dataset polarity, and `raw_proba_class1` for debugging
 ## Files
 - Dockerfile - builds a small FastAPI server image
@@ -27,9 +26,6 @@ This Space exposes two endpoints so the Flutter app can call them reliably:
    - MODEL_ID = Perth0603/phishing-email-mobilebert
    - URL_REPO = Perth0603/Random-Forest-Model-for-PhishingDetection
    - URL_FILENAME = url_rf_model.joblib  (set to your artifact filename)
-   - Alternatively use: HF_URL_MODEL_ID, HF_URL_REPO_TYPE, HF_URL_FILENAME
-   - Optional: AUTOCALIB_PHISHY_CSV, AUTOCALIB_LEGIT_CSV, KNOWN_HOSTS_CSV
-   - Optional: URL_POSITIVE_CLASS (PHISH or LEGIT)
 4. Wait for the Space to build and become green. Test:
    - GET `/` should return `{ status: ok, model: ... }`
    - POST `/predict` with `{ "inputs": "Win an iPhone! Click here" }`
@@ -46,30 +42,3 @@ Run the app:
 ```
 flutter run --dart-define-from-file=hf.env.json
 ```
-## CSV configuration
-You can provide CSV files to customize autocalibration URLs and known host overrides.
-Formats:
-```
-# autocalib_phishy.csv
-url
-http://198.51.100.23/login/update?acc=123
-http://secure-login-account-update.example.com/session?id=123
-```
-```
-# autocalib_legit.csv
-url
-https://www.wikipedia.org/
-https://www.python.org/
-```
-```
-# known_hosts.csv
-host,label
-cjplogger.com,LEGIT
-bad-login-update.example.com,PHISH
-```

   - `phishing_probability` is always the raw probability of phishing (0..1)
   - `label` is `PHISH` when `phishing_probability >= threshold`, else `LEGIT`
   - `score` is the confidence for the predicted label (for `LEGIT`, `score = 1 - phishing_probability`), which lets the app show "Safe Confidence" for legitimate URLs
 ## Files
 - Dockerfile - builds a small FastAPI server image
    - MODEL_ID = Perth0603/phishing-email-mobilebert
    - URL_REPO = Perth0603/Random-Forest-Model-for-PhishingDetection
    - URL_FILENAME = url_rf_model.joblib  (set to your artifact filename)
 4. Wait for the Space to build and become green. Test:
    - GET `/` should return `{ status: ok, model: ... }`
    - POST `/predict` with `{ "inputs": "Win an iPhone! Click here" }`
 ```
 flutter run --dart-define-from-file=hf.env.json
 ```

app.py CHANGED Viewed

@@ -6,7 +6,6 @@ os.environ.setdefault("TRANSFORMERS_CACHE", "/data/.cache")
 os.environ.setdefault("TORCH_HOME", "/data/.cache")
 from typing import Optional, List, Dict, Any
-import csv
 from urllib.parse import urlparse
 import threading
 import re
@@ -73,113 +72,64 @@ _url_phish_is_positive: Optional[bool] = None
 # -------------------------
 # You can edit these lists to define which URLs are considered obviously phishy/legit
 # for polarity auto-calibration of classical URL models (e.g., XGBoost, scikit-learn).
-# Loaded from CSV. Provide via AUTOCALIB_PHISHY_CSV or hf_space/autocalib_phishy.csv
-_AUTOCALIB_PHISHY_URLS: List[str] = []
-# Loaded from CSV. Provide via AUTOCALIB_LEGIT_CSV or hf_space/autocalib_legit.csv
-_AUTOCALIB_LEGIT_URLS: List[str] = []
-# Known host overrides (CSV-driven): hf_space/known_hosts.csv or KNOWN_HOSTS_CSV
-_KNOWN_LEGIT_HOSTS: List[str] = []
-_KNOWN_PHISH_HOSTS: List[str] = []
-def _normalize_host(value: str) -> str:
-    v = value.strip().lower()
-    if v.startswith("www."):
-        v = v[4:]
-    return v
-def _host_matches_any(host: str, known: List[str]) -> bool:
-    base = _normalize_host(host)
-    for item in known:
-        k = _normalize_host(item)
-        if base == k or base.endswith("." + k):
-            return True
-    return False
-# -------------------------
-# CSV configuration support (optional)
-# -------------------------
-def _read_urls_from_csv(path: str) -> List[str]:
-    urls: List[str] = []
-    try:
-        with open(path, newline="", encoding="utf-8") as f:
-            reader = csv.DictReader(f)
-            if "url" in (reader.fieldnames or []):
-                for row in reader:
-                    val = str(row.get("url", "")).strip()
-                    if val:
-                        urls.append(val)
-            else:
-                f.seek(0)
-                f2 = csv.reader(f)
-                for row in f2:
-                    if not row:
-                        continue
-                    val = str(row[0]).strip()
-                    if val.lower() == "url":
-                        continue
-                    if val:
-                        urls.append(val)
-    except Exception as e:
-        print(f"[csv] failed reading URLs from {path}: {e}")
-    return urls
-def _read_hosts_from_csv(path: str) -> Dict[str, str]:
-    host_to_label: Dict[str, str] = {}
-    try:
-        with open(path, newline="", encoding="utf-8") as f:
-            reader = csv.DictReader(f)
-            fields = [x.lower() for x in (reader.fieldnames or [])]
-            if "host" in fields and "label" in fields:
-                for row in reader:
-                    host = str(row.get("host", "")).strip().lower()
-                    label = str(row.get("label", "")).strip().upper()
-                    if host and label in ("PHISH", "LEGIT"):
-                        host_to_label[host] = label
-            else:
-                f.seek(0)
-                f2 = csv.reader(f)
-                for row in f2:
-                    if len(row) < 2:
-                        continue
-                    host = str(row[0]).strip().lower()
-                    label = str(row[1]).strip().upper()
-                    if host.lower() == "host" and label == "LABEL":
-                        continue
-                    if host and label in ("PHISH", "LEGIT"):
-                        host_to_label[host] = label
-    except Exception as e:
-        print(f"[csv] failed reading hosts from {path}: {e}")
-    return host_to_label
-def _load_csv_configs_if_any():
-    base_dir = os.path.dirname(__file__)
-    phishy_csv = os.environ.get("AUTOCALIB_PHISHY_CSV", os.path.join(base_dir, "autocalib_phishy.csv"))
-    legit_csv = os.environ.get("AUTOCALIB_LEGIT_CSV", os.path.join(base_dir, "autocalib_legit.csv"))
-    hosts_csv = os.environ.get("KNOWN_HOSTS_CSV", os.path.join(base_dir, "known_hosts.csv"))
-    if os.path.exists(phishy_csv):
-        urls = _read_urls_from_csv(phishy_csv)
-        if urls:
-            print(f"[csv] loaded phishy URLs: {len(urls)} from {phishy_csv}")
-            _AUTOCALIB_PHISHY_URLS[:] = urls
-    if os.path.exists(legit_csv):
-        urls = _read_urls_from_csv(legit_csv)
-        if urls:
-            print(f"[csv] loaded legit URLs: {len(urls)} from {legit_csv}")
-            _AUTOCALIB_LEGIT_URLS[:] = urls
-    if os.path.exists(hosts_csv):
-        mapping = _read_hosts_from_csv(hosts_csv)
-        if mapping:
-            print(f"[csv] loaded known hosts: {len(mapping)} from {hosts_csv}")
-            _KNOWN_LEGIT_HOSTS.clear()
-            _KNOWN_PHISH_HOSTS.clear()
-            for host, label in mapping.items():
-                if label == "LEGIT":
-                    _KNOWN_LEGIT_HOSTS.append(host)
-                elif label == "PHISH":
-                    _KNOWN_PHISH_HOSTS.append(host)
 # -------------------------
 # URL features (must match training)
@@ -279,21 +229,6 @@ def _auto_calibrate_phish_positive(bundle: Dict[str, Any], feature_cols: List[st
     phishy = _AUTOCALIB_PHISHY_URLS
     legit = _AUTOCALIB_LEGIT_URLS
-    # Guard: if CSVs are empty, fall back to safe defaults
-    if not phishy:
-        phishy = [
-            "http://198.51.100.23/login/update?acc=123",
-            "http://secure-login-account-update.example.com/session?id=123",
-            "http://bank.verify-update-security.com/confirm",
-            "http://paypal.com.account-verify.cn/login",
-        ]
-    if not legit:
-        legit = [
-            "https://www.wikipedia.org/",
-            "https://www.python.org/",
-            "https://www.microsoft.com/",
-            "https://www.openai.com/",
-        ]
     model = bundle.get("model")
     model_type: str = str(bundle.get("model_type") or "")
@@ -359,8 +294,6 @@ def _startup():
         print(f"[startup] text model load failed: {e}")
     try:
         _load_url_model()
-        # Load CSV-based config if present
-        _load_csv_configs_if_any()
         global _url_phish_is_positive
         b = _url_bundle
         if isinstance(b, dict) and _url_phish_is_positive is None:
@@ -380,20 +313,6 @@ def _startup():
 def root():
     return {"status": "ok", "model": MODEL_ID}
-@app.get("/debug-config")
-def debug_config():
-    return {
-        "phishy_count": len(_AUTOCALIB_PHISHY_URLS),
-        "legit_count": len(_AUTOCALIB_LEGIT_URLS),
-        "known_legit_hosts": _KNOWN_LEGIT_HOSTS[:50],
-        "known_phish_hosts": _KNOWN_PHISH_HOSTS[:50],
-        "url_repo": URL_REPO,
-        "url_repo_type": URL_REPO_TYPE,
-        "url_filename": URL_FILENAME,
-        "phish_is_positive_env": URL_POSITIVE_CLASS_ENV if URL_POSITIVE_CLASS_ENV else None,
-        "resolved_phish_is_positive": _url_phish_is_positive,
-    }
 @app.post("/predict")
 def predict(payload: PredictPayload):
     try:
@@ -468,9 +387,9 @@ def predict_url(payload: PredictUrlPayload):
         host = (urlparse(url_str).hostname or "").lower()
         if host:
             override_label: Optional[str] = None
-            if _host_matches_any(host, _KNOWN_LEGIT_HOSTS):
                 override_label = "LEGIT"
-            elif _host_matches_any(host, _KNOWN_PHISH_HOSTS):
                 override_label = "PHISH"
             if override_label is not None:
                 # Map numeric label according to resolved polarity

 os.environ.setdefault("TORCH_HOME", "/data/.cache")
 from typing import Optional, List, Dict, Any
 from urllib.parse import urlparse
 import threading
 import re
 # -------------------------
 # You can edit these lists to define which URLs are considered obviously phishy/legit
 # for polarity auto-calibration of classical URL models (e.g., XGBoost, scikit-learn).
+_AUTOCALIB_PHISHY_URLS: List[str] = [
+    "http://198.51.100.23/login/update?acc=123",
+    "http://secure-login-account-update.example.com/session?id=123",
+    "http://bank.verify-update-security.com/confirm",
+    "http://paypal.com.account-verify.cn/login",
+    "http://abc.xyz/downloads/invoice.exe",
+    "http://203.0.113.45/verify/account",
+    "http://login-secure-update.example.net/confirm",
+    "http://paypal.com.verify.bill.cn/login",
+    "http://account-update-security-pay.example.org/verify",
+    "http://secure-login-microsoft.example.info/reset",
+    "http://login.verify-paypal.support-id.example.com/",
+    "http://dropbox.com.security-alert.example.net/login",
+    "http://bankofamerica.secure-update.example.co/verify",
+    "http://icloud.apple.com.signin.security-alert.example.co/login",
+    "http://google.com.accounts.security-check.example.xyz/signin",
+    "http://update-billing-info.example-downloads.com/invoice.zip",
+    "http://download.secure-update.example.com/app.exe",
+    "http://192.0.2.10/secure/login",
+    "http://198.51.100.50/account/verify?session=abc",
+    "http://example.com@evil.com/login",
+    "http://secure.example.com-login.verify.co/reset",
+    "http://support-paypal.example.co.uk.refund.cn/login",
+    "http://microsoft.account-security.example.ru/update",
+    "http://amazon.verify-order.example.top/confirm",
+    "http://webscr.paypal.example.phish/login",
+]
+_AUTOCALIB_LEGIT_URLS: List[str] = [
+    "https://www.wikipedia.org/",
+    "https://www.microsoft.com/",
+    "https://www.openai.com/",
+    "https://www.python.org/",
+    "https://www.gov.uk/",
+    "https://www.google.com/",
+    "https://www.apple.com/",
+    "https://github.com/",
+    "https://stackoverflow.com/",
+    "https://www.bbc.com/",
+    "https://www.nytimes.com/",
+    "https://www.nasa.gov/",
+    "https://www.mozilla.org/",
+    "https://www.cloudflare.com/",
+    "https://www.reddit.com/",
+    "https://www.linkedin.com/",
+    "https://www.youtube.com/",
+    "https://developer.apple.com/",
+    "https://aws.amazon.com/",
+    "https://azure.microsoft.com/",
+]
+# Known host overrides (editable): force certain domains as LEGIT or PHISH
+_KNOWN_LEGIT_HOSTS: List[str] = [
+    "cjplogger.com",
+    "www.cjplogger.com",
+]
+_KNOWN_PHISH_HOSTS: List[str] = [
+]
 # -------------------------
 # URL features (must match training)
     phishy = _AUTOCALIB_PHISHY_URLS
     legit = _AUTOCALIB_LEGIT_URLS
     model = bundle.get("model")
     model_type: str = str(bundle.get("model_type") or "")
         print(f"[startup] text model load failed: {e}")
     try:
         _load_url_model()
         global _url_phish_is_positive
         b = _url_bundle
         if isinstance(b, dict) and _url_phish_is_positive is None:
 def root():
     return {"status": "ok", "model": MODEL_ID}
 @app.post("/predict")
 def predict(payload: PredictPayload):
     try:
         host = (urlparse(url_str).hostname or "").lower()
         if host:
             override_label: Optional[str] = None
+            if host in _KNOWN_LEGIT_HOSTS:
                 override_label = "LEGIT"
+            elif host in _KNOWN_PHISH_HOSTS:
                 override_label = "PHISH"
             if override_label is not None:
                 # Map numeric label according to resolved polarity