Upload 7 files
Browse files- README.md +31 -0
- app.py +111 -23
- autocalib_legit.csv +27 -0
- autocalib_phishy.csv +19 -0
- known_hosts.csv +14 -0
README.md
CHANGED
|
@@ -13,6 +13,7 @@ This Space exposes two endpoints so the Flutter app can call them reliably:
|
|
| 13 |
- `phishing_probability` is always the raw probability of phishing (0..1)
|
| 14 |
- `label` is `PHISH` when `phishing_probability >= threshold`, else `LEGIT`
|
| 15 |
- `score` is the confidence for the predicted label (for `LEGIT`, `score = 1 - phishing_probability`), which lets the app show "Safe Confidence" for legitimate URLs
|
|
|
|
| 16 |
|
| 17 |
## Files
|
| 18 |
- Dockerfile - builds a small FastAPI server image
|
|
@@ -26,6 +27,9 @@ This Space exposes two endpoints so the Flutter app can call them reliably:
|
|
| 26 |
- MODEL_ID = Perth0603/phishing-email-mobilebert
|
| 27 |
- URL_REPO = Perth0603/Random-Forest-Model-for-PhishingDetection
|
| 28 |
- URL_FILENAME = url_rf_model.joblib (set to your artifact filename)
|
|
|
|
|
|
|
|
|
|
| 29 |
4. Wait for the Space to build and become green. Test:
|
| 30 |
- GET `/` should return `{ status: ok, model: ... }`
|
| 31 |
- POST `/predict` with `{ "inputs": "Win an iPhone! Click here" }`
|
|
@@ -42,3 +46,30 @@ Run the app:
|
|
| 42 |
```
|
| 43 |
flutter run --dart-define-from-file=hf.env.json
|
| 44 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
- `phishing_probability` is always the raw probability of phishing (0..1)
|
| 14 |
- `label` is `PHISH` when `phishing_probability >= threshold`, else `LEGIT`
|
| 15 |
- `score` is the confidence for the predicted label (for `LEGIT`, `score = 1 - phishing_probability`), which lets the app show "Safe Confidence" for legitimate URLs
|
| 16 |
+
- Also includes `predicted_label` (0→LEGIT, 1→PHISH) aligned to dataset polarity, and `raw_proba_class1` for debugging
|
| 17 |
|
| 18 |
## Files
|
| 19 |
- Dockerfile - builds a small FastAPI server image
|
|
|
|
| 27 |
- MODEL_ID = Perth0603/phishing-email-mobilebert
|
| 28 |
- URL_REPO = Perth0603/Random-Forest-Model-for-PhishingDetection
|
| 29 |
- URL_FILENAME = url_rf_model.joblib (set to your artifact filename)
|
| 30 |
+
- Alternatively use: HF_URL_MODEL_ID, HF_URL_REPO_TYPE, HF_URL_FILENAME
|
| 31 |
+
- Optional: AUTOCALIB_PHISHY_CSV, AUTOCALIB_LEGIT_CSV, KNOWN_HOSTS_CSV
|
| 32 |
+
- Optional: URL_POSITIVE_CLASS (PHISH or LEGIT)
|
| 33 |
4. Wait for the Space to build and become green. Test:
|
| 34 |
- GET `/` should return `{ status: ok, model: ... }`
|
| 35 |
- POST `/predict` with `{ "inputs": "Win an iPhone! Click here" }`
|
|
|
|
| 46 |
```
|
| 47 |
flutter run --dart-define-from-file=hf.env.json
|
| 48 |
```
|
| 49 |
+
|
| 50 |
+
## CSV configuration
|
| 51 |
+
|
| 52 |
+
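You can provide CSV files to customize the auto-calibration URLs and the known-host overrides. By default the server looks for `autocalib_phishy.csv`, `autocalib_legit.csv`, and `known_hosts.csv` in the same directory as `app.py`; set `AUTOCALIB_PHISHY_CSV`, `AUTOCALIB_LEGIT_CSV`, or `KNOWN_HOSTS_CSV` to load a file from a different path.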
|
| 53 |
+
|
| 54 |
+
Formats:
|
| 55 |
+
|
| 56 |
+
```
|
| 57 |
+
# autocalib_phishy.csv
|
| 58 |
+
url
|
| 59 |
+
http://198.51.100.23/login/update?acc=123
|
| 60 |
+
http://secure-login-account-update.example.com/session?id=123
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
```
|
| 64 |
+
# autocalib_legit.csv
|
| 65 |
+
url
|
| 66 |
+
https://www.wikipedia.org/
|
| 67 |
+
https://www.python.org/
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
```
|
| 71 |
+
# known_hosts.csv
|
| 72 |
+
host,label
|
| 73 |
+
cjplogger.com,LEGIT
|
| 74 |
+
bad-login-update.example.com,PHISH
|
| 75 |
+
```
|
app.py
CHANGED
|
@@ -6,6 +6,7 @@ os.environ.setdefault("TRANSFORMERS_CACHE", "/data/.cache")
|
|
| 6 |
os.environ.setdefault("TORCH_HOME", "/data/.cache")
|
| 7 |
|
| 8 |
from typing import Optional, List, Dict, Any
|
|
|
|
| 9 |
from urllib.parse import urlparse
|
| 10 |
import threading
|
| 11 |
import re
|
|
@@ -72,29 +73,99 @@ _url_phish_is_positive: Optional[bool] = None
|
|
| 72 |
# -------------------------
|
| 73 |
# You can edit these lists to define which URLs are considered obviously phishy/legit
|
| 74 |
# for polarity auto-calibration of classical URL models (e.g., XGBoost, scikit-learn).
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
]
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
# -------------------------
|
| 100 |
# URL features (must match training)
|
|
@@ -194,6 +265,21 @@ def _auto_calibrate_phish_positive(bundle: Dict[str, Any], feature_cols: List[st
|
|
| 194 |
|
| 195 |
phishy = _AUTOCALIB_PHISHY_URLS
|
| 196 |
legit = _AUTOCALIB_LEGIT_URLS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
model = bundle.get("model")
|
| 199 |
model_type: str = str(bundle.get("model_type") or "")
|
|
@@ -259,6 +345,8 @@ def _startup():
|
|
| 259 |
print(f"[startup] text model load failed: {e}")
|
| 260 |
try:
|
| 261 |
_load_url_model()
|
|
|
|
|
|
|
| 262 |
global _url_phish_is_positive
|
| 263 |
b = _url_bundle
|
| 264 |
if isinstance(b, dict) and _url_phish_is_positive is None:
|
|
|
|
| 6 |
os.environ.setdefault("TORCH_HOME", "/data/.cache")
|
| 7 |
|
| 8 |
from typing import Optional, List, Dict, Any
|
| 9 |
+
import csv
|
| 10 |
from urllib.parse import urlparse
|
| 11 |
import threading
|
| 12 |
import re
|
|
|
|
| 73 |
# -------------------------
|
| 74 |
# Edit the CSV files described below (rather than these lists) to define which URLs are considered obviously phishy/legit
|
| 75 |
# for polarity auto-calibration of classical URL models (e.g., XGBoost, scikit-learn).
|
| 76 |
+
# Populated at startup from a CSV: the path in AUTOCALIB_PHISHY_CSV, else autocalib_phishy.csv next to this file.
|
| 77 |
+
_AUTOCALIB_PHISHY_URLS: List[str] = []
|
| 78 |
+
|
| 79 |
+
# Populated at startup from a CSV: the path in AUTOCALIB_LEGIT_CSV, else autocalib_legit.csv next to this file.
|
| 80 |
+
_AUTOCALIB_LEGIT_URLS: List[str] = []
|
| 81 |
+
|
| 82 |
+
# Known host overrides, populated at startup from a CSV: the path in KNOWN_HOSTS_CSV, else known_hosts.csv next to this file.
|
| 83 |
+
_KNOWN_LEGIT_HOSTS: List[str] = []
|
| 84 |
+
_KNOWN_PHISH_HOSTS: List[str] = []
|
| 85 |
+
|
| 86 |
+
# -------------------------
|
| 87 |
+
# CSV configuration support (optional)
|
| 88 |
+
# -------------------------
|
| 89 |
+
def _read_urls_from_csv(path: str) -> List[str]:
    """Read a list of URLs from a CSV file.

    The first non-blank row is treated as a header when one of its cells is
    ``url`` (compared case-insensitively, ignoring surrounding whitespace);
    values are then taken from that column. Otherwise the file is treated as
    header-less and the first column of every row is used.

    Args:
        path: Filesystem path of the CSV file.

    Returns:
        The non-empty, whitespace-stripped URL cells in file order. If the
        file cannot be opened or parsed, the error is printed and whatever
        was collected so far (possibly nothing) is returned.
    """
    urls: List[str] = []
    try:
        # "utf-8-sig" drops the BOM that spreadsheet exports prepend; with
        # plain "utf-8" the header would be read as "\ufeffurl" and end up
        # in the result as if it were a URL.
        with open(path, newline="", encoding="utf-8-sig") as f:
            column = -1  # -1 means the header row has not been inspected yet
            for row in csv.reader(f):
                if not row:
                    continue
                if column < 0:
                    names = [cell.strip().lower() for cell in row]
                    if "url" in names:
                        column = names.index("url")
                        continue  # header row carries no data
                    column = 0  # no header: this row is already data
                if len(row) <= column:
                    # Short row without a cell in the URL column.
                    continue
                value = row[column].strip()
                # A bare "url" cell is a stray header line, not a URL.
                if value and value.lower() != "url":
                    urls.append(value)
    except Exception as e:
        print(f"[csv] failed reading URLs from {path}: {e}")
    return urls
|
| 113 |
+
|
| 114 |
+
def _read_hosts_from_csv(path: str) -> Dict[str, str]:
    """Read a host -> label mapping from a CSV file.

    The first non-blank row is treated as a header when it contains both a
    ``host`` and a ``label`` cell (compared case-insensitively, ignoring
    surrounding whitespace); the two columns are then located by name.
    Otherwise the file is treated as header-less with the host in the first
    column and the label in the second.

    Args:
        path: Filesystem path of the CSV file.

    Returns:
        A dict mapping lower-cased host names to ``"PHISH"`` or ``"LEGIT"``.
        Rows with an empty host or any other label are ignored; a host that
        appears more than once keeps its last label. If the file cannot be
        opened or parsed, the error is printed and whatever was collected so
        far (possibly nothing) is returned.
    """
    host_to_label: Dict[str, str] = {}
    try:
        # "utf-8-sig" drops a leading BOM so the first header cell still
        # compares equal to "host"/"label".
        with open(path, newline="", encoding="utf-8-sig") as f:
            host_col, label_col = 0, 1
            header_checked = False
            for row in csv.reader(f):
                if not row:
                    continue
                if not header_checked:
                    header_checked = True
                    # Locate columns by normalized header position so that
                    # headers such as "Host,Label" work; looking rows up by
                    # the lower-cased name would miss the original-case key.
                    names = [cell.strip().lower() for cell in row]
                    if "host" in names and "label" in names:
                        host_col = names.index("host")
                        label_col = names.index("label")
                        continue  # header row carries no data
                if len(row) <= max(host_col, label_col):
                    # Short row: one of the two required cells is missing.
                    continue
                host = row[host_col].strip().lower()
                label = row[label_col].strip().upper()
                if host and label in ("PHISH", "LEGIT"):
                    host_to_label[host] = label
    except Exception as e:
        print(f"[csv] failed reading hosts from {path}: {e}")
    return host_to_label
|
| 141 |
+
|
| 142 |
+
def _load_csv_configs_if_any():
    """Fill the auto-calibration URL lists and known-host lists from CSV files.

    Each file path comes from an environment variable (AUTOCALIB_PHISHY_CSV,
    AUTOCALIB_LEGIT_CSV, KNOWN_HOSTS_CSV). When the variable is unset or
    blank, the file with the default name in this module's directory is used
    instead.

    The module-level lists are updated in place, so existing references to
    them see the new contents. A list is left untouched when its file does
    not exist or yields no usable entries.
    """
    base_dir = os.path.dirname(__file__)

    def resolve(env_name: str, default_name: str) -> Optional[str]:
        # Return the CSV path to load for this setting, or None to skip it.
        configured = os.environ.get(env_name, "").strip()
        if configured:
            if os.path.exists(configured):
                return configured
            # An explicitly configured path that is missing is almost always
            # a deployment mistake, so report it instead of skipping silently.
            print(f"[csv] {env_name} points to a missing file: {configured}")
            return None
        # Unset or blank variable: fall back to the file next to this module.
        candidate = os.path.join(base_dir, default_name)
        return candidate if os.path.exists(candidate) else None

    phishy_csv = resolve("AUTOCALIB_PHISHY_CSV", "autocalib_phishy.csv")
    if phishy_csv:
        urls = _read_urls_from_csv(phishy_csv)
        if urls:
            print(f"[csv] loaded phishy URLs: {len(urls)} from {phishy_csv}")
            _AUTOCALIB_PHISHY_URLS[:] = urls

    legit_csv = resolve("AUTOCALIB_LEGIT_CSV", "autocalib_legit.csv")
    if legit_csv:
        urls = _read_urls_from_csv(legit_csv)
        if urls:
            print(f"[csv] loaded legit URLs: {len(urls)} from {legit_csv}")
            _AUTOCALIB_LEGIT_URLS[:] = urls

    hosts_csv = resolve("KNOWN_HOSTS_CSV", "known_hosts.csv")
    if hosts_csv:
        mapping = _read_hosts_from_csv(hosts_csv)
        if mapping:
            print(f"[csv] loaded known hosts: {len(mapping)} from {hosts_csv}")
            # Slice assignment replaces the contents while keeping the same
            # list objects that other code may already hold.
            _KNOWN_LEGIT_HOSTS[:] = [h for h, lab in mapping.items() if lab == "LEGIT"]
            _KNOWN_PHISH_HOSTS[:] = [h for h, lab in mapping.items() if lab == "PHISH"]
|
| 169 |
|
| 170 |
# -------------------------
|
| 171 |
# URL features (must match training)
|
|
|
|
| 265 |
|
| 266 |
phishy = _AUTOCALIB_PHISHY_URLS
|
| 267 |
legit = _AUTOCALIB_LEGIT_URLS
|
| 268 |
+
# Guard: if CSVs are empty, fall back to safe defaults
|
| 269 |
+
if not phishy:
|
| 270 |
+
phishy = [
|
| 271 |
+
"http://198.51.100.23/login/update?acc=123",
|
| 272 |
+
"http://secure-login-account-update.example.com/session?id=123",
|
| 273 |
+
"http://bank.verify-update-security.com/confirm",
|
| 274 |
+
"http://paypal.com.account-verify.cn/login",
|
| 275 |
+
]
|
| 276 |
+
if not legit:
|
| 277 |
+
legit = [
|
| 278 |
+
"https://www.wikipedia.org/",
|
| 279 |
+
"https://www.python.org/",
|
| 280 |
+
"https://www.microsoft.com/",
|
| 281 |
+
"https://www.openai.com/",
|
| 282 |
+
]
|
| 283 |
|
| 284 |
model = bundle.get("model")
|
| 285 |
model_type: str = str(bundle.get("model_type") or "")
|
|
|
|
| 345 |
print(f"[startup] text model load failed: {e}")
|
| 346 |
try:
|
| 347 |
_load_url_model()
|
| 348 |
+
# Load CSV-based config if present
|
| 349 |
+
_load_csv_configs_if_any()
|
| 350 |
global _url_phish_is_positive
|
| 351 |
b = _url_bundle
|
| 352 |
if isinstance(b, dict) and _url_phish_is_positive is None:
|
autocalib_legit.csv
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
url
|
| 2 |
+
https://www.wikipedia.org/
|
| 3 |
+
https://www.microsoft.com/
|
| 4 |
+
https://www.openai.com/
|
| 5 |
+
https://www.python.org/
|
| 6 |
+
https://www.gov.uk/
|
| 7 |
+
https://www.google.com/
|
| 8 |
+
https://www.apple.com/
|
| 9 |
+
https://www.amazon.com/
|
| 10 |
+
https://www.github.com/
|
| 11 |
+
https://stackoverflow.com/
|
| 12 |
+
https://www.nytimes.com/
|
| 13 |
+
https://www.bbc.com/
|
| 14 |
+
https://www.cnn.com/
|
| 15 |
+
https://www.gov.sg/
|
| 16 |
+
https://www.whitehouse.gov/
|
| 17 |
+
https://www.europa.eu/
|
| 18 |
+
https://www.cloudflare.com/
|
| 19 |
+
https://www.dropbox.com/
|
| 20 |
+
https://drive.google.com/
|
| 21 |
+
https://www.paypal.com/
|
| 22 |
+
https://www.facebook.com/
|
| 23 |
+
https://www.linkedin.com/
|
| 24 |
+
https://www.youtube.com/
|
| 25 |
+
https://www.reddit.com/
|
| 26 |
+
http://www.cjplogger.com/
|
| 27 |
+
|
autocalib_phishy.csv
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
url
|
| 2 |
+
http://198.51.100.23/login/update?acc=123
|
| 3 |
+
http://secure-login-account-update.example.com/session?id=123
|
| 4 |
+
http://bank.verify-update-security.com/confirm
|
| 5 |
+
http://paypal.com.account-verify.cn/login
|
| 6 |
+
http://abc.xyz/downloads/invoice.exe
|
| 7 |
+
http://update-login-security-paypal.com/verify
|
| 8 |
+
http://login-secure-paypa1.com/
|
| 9 |
+
http://verify-account-bankof-usa.example.co/reset
|
| 10 |
+
http://support.microsoft.com.example.net/reset-password
|
| 11 |
+
http://secure.appleid.apple.com.example.co/login
|
| 12 |
+
http://drive-google-com.example.org/share/document?id=123
|
| 13 |
+
http://198.51.100.45/pay/confirm?trx=9988
|
| 14 |
+
http://203.0.113.10/parcel/tracking/update
|
| 15 |
+
http://signin-amazon.example.tk/refund
|
| 16 |
+
http://security-update-facebook.example.in/login
|
| 17 |
+
http://login-secure-outlook.example.biz/
|
| 18 |
+
http://dropbox-login.example.co/downloads/setup.zip
|
| 19 |
+
|
known_hosts.csv
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
host,label
|
| 2 |
+
cjplogger.com,LEGIT
|
| 3 |
+
www.cjplogger.com,LEGIT
|
| 4 |
+
wikipedia.org,LEGIT
|
| 5 |
+
www.wikipedia.org,LEGIT
|
| 6 |
+
microsoft.com,LEGIT
|
| 7 |
+
www.microsoft.com,LEGIT
|
| 8 |
+
google.com,LEGIT
|
| 9 |
+
www.google.com,LEGIT
|
| 10 |
+
github.com,LEGIT
|
| 11 |
+
www.github.com,LEGIT
|
| 12 |
+
python.org,LEGIT
|
| 13 |
+
www.python.org,LEGIT
|
| 14 |
+
|