Upload app.py
Browse files
app.py
CHANGED
|
@@ -6,6 +6,7 @@ os.environ.setdefault("TRANSFORMERS_CACHE", "/data/.cache")
|
|
| 6 |
os.environ.setdefault("TORCH_HOME", "/data/.cache")
|
| 7 |
|
| 8 |
from typing import Optional, List, Dict, Any
|
|
|
|
| 9 |
import threading
|
| 10 |
import re
|
| 11 |
import numpy as np
|
|
@@ -87,6 +88,14 @@ _AUTOCALIB_LEGIT_URLS: List[str] = [
|
|
| 87 |
"https://www.gov.uk/",
|
| 88 |
]
|
| 89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
# -------------------------
|
| 91 |
# URL features (must match training)
|
| 92 |
# -------------------------
|
|
@@ -339,6 +348,37 @@ def predict_url(payload: PredictUrlPayload):
|
|
| 339 |
"phish_is_positive_env": URL_POSITIVE_CLASS_ENV if URL_POSITIVE_CLASS_ENV else None,
|
| 340 |
}
|
| 341 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
raw_p_class1_debug: Optional[float] = None
|
| 343 |
|
| 344 |
if isinstance(model_type, str) and model_type == "xgboost_bst":
|
|
|
|
| 6 |
os.environ.setdefault("TORCH_HOME", "/data/.cache")
|
| 7 |
|
| 8 |
from typing import Optional, List, Dict, Any
|
| 9 |
+
from urllib.parse import urlparse
|
| 10 |
import threading
|
| 11 |
import re
|
| 12 |
import numpy as np
|
|
|
|
| 88 |
"https://www.gov.uk/",
|
| 89 |
]
|
| 90 |
|
| 91 |
+
# Known host overrides (editable): force certain domains as LEGIT or PHISH
|
| 92 |
+
_KNOWN_LEGIT_HOSTS: List[str] = [
|
| 93 |
+
"cjplogger.com",
|
| 94 |
+
"www.cjplogger.com",
|
| 95 |
+
]
|
| 96 |
+
_KNOWN_PHISH_HOSTS: List[str] = [
|
| 97 |
+
]
|
| 98 |
+
|
| 99 |
# -------------------------
|
| 100 |
# URL features (must match training)
|
| 101 |
# -------------------------
|
|
|
|
| 348 |
"phish_is_positive_env": URL_POSITIVE_CLASS_ENV if URL_POSITIVE_CLASS_ENV else None,
|
| 349 |
}
|
| 350 |
|
| 351 |
+
# Known-domain override after polarity is resolved
|
| 352 |
+
host = (urlparse(url_str).hostname or "").lower()
|
| 353 |
+
if host:
|
| 354 |
+
override_label: Optional[str] = None
|
| 355 |
+
if host in _KNOWN_LEGIT_HOSTS:
|
| 356 |
+
override_label = "LEGIT"
|
| 357 |
+
elif host in _KNOWN_PHISH_HOSTS:
|
| 358 |
+
override_label = "PHISH"
|
| 359 |
+
if override_label is not None:
|
| 360 |
+
# Map numeric label according to resolved polarity
|
| 361 |
+
predicted_label_numeric = 1 if ((override_label == "PHISH") == bool(phish_is_positive)) else 0
|
| 362 |
+
phish_proba_override = 0.99 if override_label == "PHISH" else 0.01
|
| 363 |
+
score_override = phish_proba_override if override_label == "PHISH" else (1.0 - phish_proba_override)
|
| 364 |
+
return {
|
| 365 |
+
"label": override_label,
|
| 366 |
+
"predicted_label": int(predicted_label_numeric),
|
| 367 |
+
"score": float(score_override),
|
| 368 |
+
"phishing_probability": float(phish_proba_override),
|
| 369 |
+
"backend": str(model_type),
|
| 370 |
+
"threshold": 0.5,
|
| 371 |
+
"override": {
|
| 372 |
+
"reason": "known_host",
|
| 373 |
+
"host": host,
|
| 374 |
+
},
|
| 375 |
+
"phish_is_positive": bool(phish_is_positive),
|
| 376 |
+
"phish_is_positive_bundle": meta_phish_is_positive,
|
| 377 |
+
"phish_is_positive_env": URL_POSITIVE_CLASS_ENV if URL_POSITIVE_CLASS_ENV else None,
|
| 378 |
+
"feature_cols": feature_cols,
|
| 379 |
+
"url_col": url_col,
|
| 380 |
+
}
|
| 381 |
+
|
| 382 |
raw_p_class1_debug: Optional[float] = None
|
| 383 |
|
| 384 |
if isinstance(model_type, str) and model_type == "xgboost_bst":
|