Upload 4 files
Browse files
- Dockerfile +0 -4
- README.md +0 -31
- app.py +60 -141
Dockerfile
CHANGED
|
@@ -20,11 +20,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
| 20 |
COPY requirements.txt /app/requirements.txt
|
| 21 |
RUN pip install -r /app/requirements.txt
|
| 22 |
|
| 23 |
-
# App code and data files
|
| 24 |
COPY app.py /app/app.py
|
| 25 |
-
COPY autocalib_phishy.csv /app/autocalib_phishy.csv
|
| 26 |
-
COPY autocalib_legit.csv /app/autocalib_legit.csv
|
| 27 |
-
COPY known_hosts.csv /app/known_hosts.csv
|
| 28 |
|
| 29 |
EXPOSE 7860
|
| 30 |
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
|
| 20 |
COPY requirements.txt /app/requirements.txt
|
| 21 |
RUN pip install -r /app/requirements.txt
|
| 22 |
|
|
|
|
| 23 |
COPY app.py /app/app.py
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
EXPOSE 7860
|
| 26 |
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -13,7 +13,6 @@ This Space exposes two endpoints so the Flutter app can call them reliably:
|
|
| 13 |
- `phishing_probability` is always the raw probability of phishing (0..1)
|
| 14 |
- `label` is `PHISH` when `phishing_probability >= threshold`, else `LEGIT`
|
| 15 |
- `score` is the confidence for the predicted label (for `LEGIT`, `score = 1 - phishing_probability`), which lets the app show "Safe Confidence" for legitimate URLs
|
| 16 |
-
- Also includes `predicted_label` (0→LEGIT, 1→PHISH) aligned to dataset polarity, and `raw_proba_class1` for debugging
|
| 17 |
|
| 18 |
## Files
|
| 19 |
- Dockerfile - builds a small FastAPI server image
|
|
@@ -27,9 +26,6 @@ This Space exposes two endpoints so the Flutter app can call them reliably:
|
|
| 27 |
- MODEL_ID = Perth0603/phishing-email-mobilebert
|
| 28 |
- URL_REPO = Perth0603/Random-Forest-Model-for-PhishingDetection
|
| 29 |
- URL_FILENAME = url_rf_model.joblib (set to your artifact filename)
|
| 30 |
-
- Alternatively use: HF_URL_MODEL_ID, HF_URL_REPO_TYPE, HF_URL_FILENAME
|
| 31 |
-
- Optional: AUTOCALIB_PHISHY_CSV, AUTOCALIB_LEGIT_CSV, KNOWN_HOSTS_CSV
|
| 32 |
-
- Optional: URL_POSITIVE_CLASS (PHISH or LEGIT)
|
| 33 |
4. Wait for the Space to build and become green. Test:
|
| 34 |
- GET `/` should return `{ status: ok, model: ... }`
|
| 35 |
- POST `/predict` with `{ "inputs": "Win an iPhone! Click here" }`
|
|
@@ -46,30 +42,3 @@ Run the app:
|
|
| 46 |
```
|
| 47 |
flutter run --dart-define-from-file=hf.env.json
|
| 48 |
```
|
| 49 |
-
|
| 50 |
-
## CSV configuration
|
| 51 |
-
|
| 52 |
-
You can provide CSV files to customize autocalibration URLs and known host overrides.
|
| 53 |
-
|
| 54 |
-
Formats:
|
| 55 |
-
|
| 56 |
-
```
|
| 57 |
-
# autocalib_phishy.csv
|
| 58 |
-
url
|
| 59 |
-
http://198.51.100.23/login/update?acc=123
|
| 60 |
-
http://secure-login-account-update.example.com/session?id=123
|
| 61 |
-
```
|
| 62 |
-
|
| 63 |
-
```
|
| 64 |
-
# autocalib_legit.csv
|
| 65 |
-
url
|
| 66 |
-
https://www.wikipedia.org/
|
| 67 |
-
https://www.python.org/
|
| 68 |
-
```
|
| 69 |
-
|
| 70 |
-
```
|
| 71 |
-
# known_hosts.csv
|
| 72 |
-
host,label
|
| 73 |
-
cjplogger.com,LEGIT
|
| 74 |
-
bad-login-update.example.com,PHISH
|
| 75 |
-
```
|
|
|
|
| 13 |
- `phishing_probability` is always the raw probability of phishing (0..1)
|
| 14 |
- `label` is `PHISH` when `phishing_probability >= threshold`, else `LEGIT`
|
| 15 |
- `score` is the confidence for the predicted label (for `LEGIT`, `score = 1 - phishing_probability`), which lets the app show "Safe Confidence" for legitimate URLs
|
|
|
|
| 16 |
|
| 17 |
## Files
|
| 18 |
- Dockerfile - builds a small FastAPI server image
|
|
|
|
| 26 |
- MODEL_ID = Perth0603/phishing-email-mobilebert
|
| 27 |
- URL_REPO = Perth0603/Random-Forest-Model-for-PhishingDetection
|
| 28 |
- URL_FILENAME = url_rf_model.joblib (set to your artifact filename)
|
|
|
|
|
|
|
|
|
|
| 29 |
4. Wait for the Space to build and become green. Test:
|
| 30 |
- GET `/` should return `{ status: ok, model: ... }`
|
| 31 |
- POST `/predict` with `{ "inputs": "Win an iPhone! Click here" }`
|
|
|
|
| 42 |
```
|
| 43 |
flutter run --dart-define-from-file=hf.env.json
|
| 44 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -6,7 +6,6 @@ os.environ.setdefault("TRANSFORMERS_CACHE", "/data/.cache")
|
|
| 6 |
os.environ.setdefault("TORCH_HOME", "/data/.cache")
|
| 7 |
|
| 8 |
from typing import Optional, List, Dict, Any
|
| 9 |
-
import csv
|
| 10 |
from urllib.parse import urlparse
|
| 11 |
import threading
|
| 12 |
import re
|
|
@@ -73,113 +72,64 @@ _url_phish_is_positive: Optional[bool] = None
|
|
| 73 |
# -------------------------
|
| 74 |
# You can edit these lists to define which URLs are considered obviously phishy/legit
|
| 75 |
# for polarity auto-calibration of classical URL models (e.g., XGBoost, scikit-learn).
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
if "host" in fields and "label" in fields:
|
| 135 |
-
for row in reader:
|
| 136 |
-
host = str(row.get("host", "")).strip().lower()
|
| 137 |
-
label = str(row.get("label", "")).strip().upper()
|
| 138 |
-
if host and label in ("PHISH", "LEGIT"):
|
| 139 |
-
host_to_label[host] = label
|
| 140 |
-
else:
|
| 141 |
-
f.seek(0)
|
| 142 |
-
f2 = csv.reader(f)
|
| 143 |
-
for row in f2:
|
| 144 |
-
if len(row) < 2:
|
| 145 |
-
continue
|
| 146 |
-
host = str(row[0]).strip().lower()
|
| 147 |
-
label = str(row[1]).strip().upper()
|
| 148 |
-
if host.lower() == "host" and label == "LABEL":
|
| 149 |
-
continue
|
| 150 |
-
if host and label in ("PHISH", "LEGIT"):
|
| 151 |
-
host_to_label[host] = label
|
| 152 |
-
except Exception as e:
|
| 153 |
-
print(f"[csv] failed reading hosts from {path}: {e}")
|
| 154 |
-
return host_to_label
|
| 155 |
-
|
| 156 |
-
def _load_csv_configs_if_any():
|
| 157 |
-
base_dir = os.path.dirname(__file__)
|
| 158 |
-
phishy_csv = os.environ.get("AUTOCALIB_PHISHY_CSV", os.path.join(base_dir, "autocalib_phishy.csv"))
|
| 159 |
-
legit_csv = os.environ.get("AUTOCALIB_LEGIT_CSV", os.path.join(base_dir, "autocalib_legit.csv"))
|
| 160 |
-
hosts_csv = os.environ.get("KNOWN_HOSTS_CSV", os.path.join(base_dir, "known_hosts.csv"))
|
| 161 |
-
|
| 162 |
-
if os.path.exists(phishy_csv):
|
| 163 |
-
urls = _read_urls_from_csv(phishy_csv)
|
| 164 |
-
if urls:
|
| 165 |
-
print(f"[csv] loaded phishy URLs: {len(urls)} from {phishy_csv}")
|
| 166 |
-
_AUTOCALIB_PHISHY_URLS[:] = urls
|
| 167 |
-
if os.path.exists(legit_csv):
|
| 168 |
-
urls = _read_urls_from_csv(legit_csv)
|
| 169 |
-
if urls:
|
| 170 |
-
print(f"[csv] loaded legit URLs: {len(urls)} from {legit_csv}")
|
| 171 |
-
_AUTOCALIB_LEGIT_URLS[:] = urls
|
| 172 |
-
if os.path.exists(hosts_csv):
|
| 173 |
-
mapping = _read_hosts_from_csv(hosts_csv)
|
| 174 |
-
if mapping:
|
| 175 |
-
print(f"[csv] loaded known hosts: {len(mapping)} from {hosts_csv}")
|
| 176 |
-
_KNOWN_LEGIT_HOSTS.clear()
|
| 177 |
-
_KNOWN_PHISH_HOSTS.clear()
|
| 178 |
-
for host, label in mapping.items():
|
| 179 |
-
if label == "LEGIT":
|
| 180 |
-
_KNOWN_LEGIT_HOSTS.append(host)
|
| 181 |
-
elif label == "PHISH":
|
| 182 |
-
_KNOWN_PHISH_HOSTS.append(host)
|
| 183 |
|
| 184 |
# -------------------------
|
| 185 |
# URL features (must match training)
|
|
@@ -279,21 +229,6 @@ def _auto_calibrate_phish_positive(bundle: Dict[str, Any], feature_cols: List[st
|
|
| 279 |
|
| 280 |
phishy = _AUTOCALIB_PHISHY_URLS
|
| 281 |
legit = _AUTOCALIB_LEGIT_URLS
|
| 282 |
-
# Guard: if CSVs are empty, fall back to safe defaults
|
| 283 |
-
if not phishy:
|
| 284 |
-
phishy = [
|
| 285 |
-
"http://198.51.100.23/login/update?acc=123",
|
| 286 |
-
"http://secure-login-account-update.example.com/session?id=123",
|
| 287 |
-
"http://bank.verify-update-security.com/confirm",
|
| 288 |
-
"http://paypal.com.account-verify.cn/login",
|
| 289 |
-
]
|
| 290 |
-
if not legit:
|
| 291 |
-
legit = [
|
| 292 |
-
"https://www.wikipedia.org/",
|
| 293 |
-
"https://www.python.org/",
|
| 294 |
-
"https://www.microsoft.com/",
|
| 295 |
-
"https://www.openai.com/",
|
| 296 |
-
]
|
| 297 |
|
| 298 |
model = bundle.get("model")
|
| 299 |
model_type: str = str(bundle.get("model_type") or "")
|
|
@@ -359,8 +294,6 @@ def _startup():
|
|
| 359 |
print(f"[startup] text model load failed: {e}")
|
| 360 |
try:
|
| 361 |
_load_url_model()
|
| 362 |
-
# Load CSV-based config if present
|
| 363 |
-
_load_csv_configs_if_any()
|
| 364 |
global _url_phish_is_positive
|
| 365 |
b = _url_bundle
|
| 366 |
if isinstance(b, dict) and _url_phish_is_positive is None:
|
|
@@ -380,20 +313,6 @@ def _startup():
|
|
| 380 |
def root():
|
| 381 |
return {"status": "ok", "model": MODEL_ID}
|
| 382 |
|
| 383 |
-
@app.get("/debug-config")
|
| 384 |
-
def debug_config():
|
| 385 |
-
return {
|
| 386 |
-
"phishy_count": len(_AUTOCALIB_PHISHY_URLS),
|
| 387 |
-
"legit_count": len(_AUTOCALIB_LEGIT_URLS),
|
| 388 |
-
"known_legit_hosts": _KNOWN_LEGIT_HOSTS[:50],
|
| 389 |
-
"known_phish_hosts": _KNOWN_PHISH_HOSTS[:50],
|
| 390 |
-
"url_repo": URL_REPO,
|
| 391 |
-
"url_repo_type": URL_REPO_TYPE,
|
| 392 |
-
"url_filename": URL_FILENAME,
|
| 393 |
-
"phish_is_positive_env": URL_POSITIVE_CLASS_ENV if URL_POSITIVE_CLASS_ENV else None,
|
| 394 |
-
"resolved_phish_is_positive": _url_phish_is_positive,
|
| 395 |
-
}
|
| 396 |
-
|
| 397 |
@app.post("/predict")
|
| 398 |
def predict(payload: PredictPayload):
|
| 399 |
try:
|
|
@@ -468,9 +387,9 @@ def predict_url(payload: PredictUrlPayload):
|
|
| 468 |
host = (urlparse(url_str).hostname or "").lower()
|
| 469 |
if host:
|
| 470 |
override_label: Optional[str] = None
|
| 471 |
-
if
|
| 472 |
override_label = "LEGIT"
|
| 473 |
-
elif
|
| 474 |
override_label = "PHISH"
|
| 475 |
if override_label is not None:
|
| 476 |
# Map numeric label according to resolved polarity
|
|
|
|
| 6 |
os.environ.setdefault("TORCH_HOME", "/data/.cache")
|
| 7 |
|
| 8 |
from typing import Optional, List, Dict, Any
|
|
|
|
| 9 |
from urllib.parse import urlparse
|
| 10 |
import threading
|
| 11 |
import re
|
|
|
|
| 72 |
# -------------------------
|
| 73 |
# You can edit these lists to define which URLs are considered obviously phishy/legit
|
| 74 |
# for polarity auto-calibration of classical URL models (e.g., XGBoost, scikit-learn).
|
| 75 |
+
_AUTOCALIB_PHISHY_URLS: List[str] = [
|
| 76 |
+
"http://198.51.100.23/login/update?acc=123",
|
| 77 |
+
"http://secure-login-account-update.example.com/session?id=123",
|
| 78 |
+
"http://bank.verify-update-security.com/confirm",
|
| 79 |
+
"http://paypal.com.account-verify.cn/login",
|
| 80 |
+
"http://abc.xyz/downloads/invoice.exe",
|
| 81 |
+
"http://203.0.113.45/verify/account",
|
| 82 |
+
"http://login-secure-update.example.net/confirm",
|
| 83 |
+
"http://paypal.com.verify.bill.cn/login",
|
| 84 |
+
"http://account-update-security-pay.example.org/verify",
|
| 85 |
+
"http://secure-login-microsoft.example.info/reset",
|
| 86 |
+
"http://login.verify-paypal.support-id.example.com/",
|
| 87 |
+
"http://dropbox.com.security-alert.example.net/login",
|
| 88 |
+
"http://bankofamerica.secure-update.example.co/verify",
|
| 89 |
+
"http://icloud.apple.com.signin.security-alert.example.co/login",
|
| 90 |
+
"http://google.com.accounts.security-check.example.xyz/signin",
|
| 91 |
+
"http://update-billing-info.example-downloads.com/invoice.zip",
|
| 92 |
+
"http://download.secure-update.example.com/app.exe",
|
| 93 |
+
"http://192.0.2.10/secure/login",
|
| 94 |
+
"http://198.51.100.50/account/verify?session=abc",
|
| 95 |
+
"http://example.com@evil.com/login",
|
| 96 |
+
"http://secure.example.com-login.verify.co/reset",
|
| 97 |
+
"http://support-paypal.example.co.uk.refund.cn/login",
|
| 98 |
+
"http://microsoft.account-security.example.ru/update",
|
| 99 |
+
"http://amazon.verify-order.example.top/confirm",
|
| 100 |
+
"http://webscr.paypal.example.phish/login",
|
| 101 |
+
]
|
| 102 |
+
|
| 103 |
+
_AUTOCALIB_LEGIT_URLS: List[str] = [
|
| 104 |
+
"https://www.wikipedia.org/",
|
| 105 |
+
"https://www.microsoft.com/",
|
| 106 |
+
"https://www.openai.com/",
|
| 107 |
+
"https://www.python.org/",
|
| 108 |
+
"https://www.gov.uk/",
|
| 109 |
+
"https://www.google.com/",
|
| 110 |
+
"https://www.apple.com/",
|
| 111 |
+
"https://github.com/",
|
| 112 |
+
"https://stackoverflow.com/",
|
| 113 |
+
"https://www.bbc.com/",
|
| 114 |
+
"https://www.nytimes.com/",
|
| 115 |
+
"https://www.nasa.gov/",
|
| 116 |
+
"https://www.mozilla.org/",
|
| 117 |
+
"https://www.cloudflare.com/",
|
| 118 |
+
"https://www.reddit.com/",
|
| 119 |
+
"https://www.linkedin.com/",
|
| 120 |
+
"https://www.youtube.com/",
|
| 121 |
+
"https://developer.apple.com/",
|
| 122 |
+
"https://aws.amazon.com/",
|
| 123 |
+
"https://azure.microsoft.com/",
|
| 124 |
+
]
|
| 125 |
+
|
| 126 |
+
# Known host overrides (editable): force certain domains as LEGIT or PHISH
|
| 127 |
+
_KNOWN_LEGIT_HOSTS: List[str] = [
|
| 128 |
+
"cjplogger.com",
|
| 129 |
+
"www.cjplogger.com",
|
| 130 |
+
]
|
| 131 |
+
_KNOWN_PHISH_HOSTS: List[str] = [
|
| 132 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
|
| 134 |
# -------------------------
|
| 135 |
# URL features (must match training)
|
|
|
|
| 229 |
|
| 230 |
phishy = _AUTOCALIB_PHISHY_URLS
|
| 231 |
legit = _AUTOCALIB_LEGIT_URLS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
|
| 233 |
model = bundle.get("model")
|
| 234 |
model_type: str = str(bundle.get("model_type") or "")
|
|
|
|
| 294 |
print(f"[startup] text model load failed: {e}")
|
| 295 |
try:
|
| 296 |
_load_url_model()
|
|
|
|
|
|
|
| 297 |
global _url_phish_is_positive
|
| 298 |
b = _url_bundle
|
| 299 |
if isinstance(b, dict) and _url_phish_is_positive is None:
|
|
|
|
| 313 |
def root():
|
| 314 |
return {"status": "ok", "model": MODEL_ID}
|
| 315 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
@app.post("/predict")
|
| 317 |
def predict(payload: PredictPayload):
|
| 318 |
try:
|
|
|
|
| 387 |
host = (urlparse(url_str).hostname or "").lower()
|
| 388 |
if host:
|
| 389 |
override_label: Optional[str] = None
|
| 390 |
+
if host in _KNOWN_LEGIT_HOSTS:
|
| 391 |
override_label = "LEGIT"
|
| 392 |
+
elif host in _KNOWN_PHISH_HOSTS:
|
| 393 |
override_label = "PHISH"
|
| 394 |
if override_label is not None:
|
| 395 |
# Map numeric label according to resolved polarity
|