Perth0603 committed on
Commit
d506ae1
·
verified ·
1 Parent(s): 016c0e8

Upload 7 files

Browse files
Files changed (4) hide show
  1. app.py +30 -63
  2. autocalib_legit.csv +22 -0
  3. autocalib_phishy.csv +27 -0
  4. known_hosts.csv +4 -0
app.py CHANGED
@@ -69,68 +69,19 @@ _url_lock = threading.Lock()
69
  _url_phish_is_positive: Optional[bool] = None
70
 
71
  # -------------------------
72
- # Autocalibration URL prototypes (editable)
73
  # -------------------------
74
- # You can edit these lists to define which URLs are considered obviously phishy/legit
75
- # for polarity auto-calibration of classical URL models (e.g., XGBoost, scikit-learn).
76
- _AUTOCALIB_PHISHY_URLS: List[str] = [
77
- "http://198.51.100.23/login/update?acc=123",
78
- "http://secure-login-account-update.example.com/session?id=123",
79
- "http://bank.verify-update-security.com/confirm",
80
- "http://paypal.com.account-verify.cn/login",
81
- "http://abc.xyz/downloads/invoice.exe",
82
- "http://203.0.113.45/verify/account",
83
- "http://login-secure-update.example.net/confirm",
84
- "http://paypal.com.verify.bill.cn/login",
85
- "http://account-update-security-pay.example.org/verify",
86
- "http://secure-login-microsoft.example.info/reset",
87
- "http://login.verify-paypal.support-id.example.com/",
88
- "http://dropbox.com.security-alert.example.net/login",
89
- "http://bankofamerica.secure-update.example.co/verify",
90
- "http://icloud.apple.com.signin.security-alert.example.co/login",
91
- "http://google.com.accounts.security-check.example.xyz/signin",
92
- "http://update-billing-info.example-downloads.com/invoice.zip",
93
- "http://download.secure-update.example.com/app.exe",
94
- "http://192.0.2.10/secure/login",
95
- "http://198.51.100.50/account/verify?session=abc",
96
- "http://example.com@evil.com/login",
97
- "http://secure.example.com-login.verify.co/reset",
98
- "http://support-paypal.example.co.uk.refund.cn/login",
99
- "http://microsoft.account-security.example.ru/update",
100
- "http://amazon.verify-order.example.top/confirm",
101
- "http://webscr.paypal.example.phish/login",
102
- ]
103
-
104
- _AUTOCALIB_LEGIT_URLS: List[str] = [
105
- "https://www.wikipedia.org/",
106
- "https://www.microsoft.com/",
107
- "https://www.openai.com/",
108
- "https://www.python.org/",
109
- "https://www.gov.uk/",
110
- "https://www.google.com/",
111
- "https://www.apple.com/",
112
- "https://github.com/",
113
- "https://stackoverflow.com/",
114
- "https://www.bbc.com/",
115
- "https://www.nytimes.com/",
116
- "https://www.nasa.gov/",
117
- "https://www.mozilla.org/",
118
- "https://www.cloudflare.com/",
119
- "https://www.reddit.com/",
120
- "https://www.linkedin.com/",
121
- "https://www.youtube.com/",
122
- "https://developer.apple.com/",
123
- "https://aws.amazon.com/",
124
- "https://azure.microsoft.com/",
125
- ]
126
-
127
- # Known host overrides (editable): force certain domains as LEGIT or PHISH
128
- _KNOWN_LEGIT_HOSTS: List[str] = [
129
- "cjplogger.com",
130
- "www.cjplogger.com",
131
- ]
132
- _KNOWN_PHISH_HOSTS: List[str] = [
133
- ]
134
 
135
  # Helpers to normalize and match hosts by suffix (handles subdomains)
136
  def _normalize_host(value: str) -> str:
@@ -327,6 +278,22 @@ def _auto_calibrate_phish_positive(bundle: Dict[str, Any], feature_cols: List[st
327
 
328
  phishy = _AUTOCALIB_PHISHY_URLS
329
  legit = _AUTOCALIB_LEGIT_URLS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
 
331
  model = bundle.get("model")
332
  model_type: str = str(bundle.get("model_type") or "")
@@ -489,9 +456,9 @@ def predict_url(payload: PredictUrlPayload):
489
  host = (urlparse(url_str).hostname or "").lower()
490
  if host:
491
  override_label: Optional[str] = None
492
- if host in _KNOWN_LEGIT_HOSTS:
493
  override_label = "LEGIT"
494
- elif host in _KNOWN_PHISH_HOSTS:
495
  override_label = "PHISH"
496
  if override_label is not None:
497
  # Map numeric label according to resolved polarity
 
69
  _url_phish_is_positive: Optional[bool] = None
70
 
71
  # -------------------------
72
+ # Autocalibration URL prototypes (CSV-driven)
73
  # -------------------------
74
+ # Provide CSV files for calibration lists to avoid code edits:
75
+ # - AUTOCALIB_PHISHY_CSV (default hf_space/autocalib_phishy.csv)
76
+ # - AUTOCALIB_LEGIT_CSV (default hf_space/autocalib_legit.csv)
77
+ # These lists are loaded at startup and before each request (hot-reload safe).
78
+ _AUTOCALIB_PHISHY_URLS: List[str] = []
79
+ _AUTOCALIB_LEGIT_URLS: List[str] = []
80
+
81
+ # Known host overrides via CSV (suffix-matched):
82
+ # - KNOWN_HOSTS_CSV (default hf_space/known_hosts.csv) with columns host,label
83
+ _KNOWN_LEGIT_HOSTS: List[str] = []
84
+ _KNOWN_PHISH_HOSTS: List[str] = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  # Helpers to normalize and match hosts by suffix (handles subdomains)
87
  def _normalize_host(value: str) -> str:
 
278
 
279
  phishy = _AUTOCALIB_PHISHY_URLS
280
  legit = _AUTOCALIB_LEGIT_URLS
281
+ # Safe fallback if CSVs are missing/empty
282
+ if not phishy:
283
+ phishy = [
284
+ "http://198.51.100.23/login/update?acc=123",
285
+ "http://secure-login-account-update.example.com/session?id=123",
286
+ "http://bank.verify-update-security.com/confirm",
287
+ "http://paypal.com.account-verify.cn/login",
288
+ "http://abc.xyz/downloads/invoice.exe",
289
+ ]
290
+ if not legit:
291
+ legit = [
292
+ "https://www.wikipedia.org/",
293
+ "https://www.microsoft.com/",
294
+ "https://www.python.org/",
295
+ "https://www.openai.com/",
296
+ ]
297
 
298
  model = bundle.get("model")
299
  model_type: str = str(bundle.get("model_type") or "")
 
456
  host = (urlparse(url_str).hostname or "").lower()
457
  if host:
458
  override_label: Optional[str] = None
459
+ if _host_matches_any(host, _KNOWN_LEGIT_HOSTS):
460
  override_label = "LEGIT"
461
+ elif _host_matches_any(host, _KNOWN_PHISH_HOSTS):
462
  override_label = "PHISH"
463
  if override_label is not None:
464
  # Map numeric label according to resolved polarity
autocalib_legit.csv ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ url
2
+ https://www.wikipedia.org/
3
+ https://www.microsoft.com/
4
+ https://www.openai.com/
5
+ https://www.python.org/
6
+ https://www.gov.uk/
7
+ https://www.google.com/
8
+ https://www.apple.com/
9
+ https://github.com/
10
+ https://stackoverflow.com/
11
+ https://www.bbc.com/
12
+ https://www.nytimes.com/
13
+ https://www.nasa.gov/
14
+ https://www.mozilla.org/
15
+ https://www.cloudflare.com/
16
+ https://www.reddit.com/
17
+ https://www.linkedin.com/
18
+ https://www.youtube.com/
19
+ https://developer.apple.com/
20
+ https://aws.amazon.com/
21
+ https://azure.microsoft.com/
22
+
autocalib_phishy.csv ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ url
2
+ http://198.51.100.23/login/update?acc=123
3
+ http://secure-login-account-update.example.com/session?id=123
4
+ http://bank.verify-update-security.com/confirm
5
+ http://paypal.com.account-verify.cn/login
6
+ http://abc.xyz/downloads/invoice.exe
7
+ http://203.0.113.45/verify/account
8
+ http://login-secure-update.example.net/confirm
9
+ http://paypal.com.verify.bill.cn/login
10
+ http://account-update-security-pay.example.org/verify
11
+ http://secure-login-microsoft.example.info/reset
12
+ http://login.verify-paypal.support-id.example.com/
13
+ http://dropbox.com.security-alert.example.net/login
14
+ http://bankofamerica.secure-update.example.co/verify
15
+ http://icloud.apple.com.signin.security-alert.example.co/login
16
+ http://google.com.accounts.security-check.example.xyz/signin
17
+ http://update-billing-info.example-downloads.com/invoice.zip
18
+ http://download.secure-update.example.com/app.exe
19
+ http://192.0.2.10/secure/login
20
+ http://198.51.100.50/account/verify?session=abc
21
+ http://example.com@evil.com/login
22
+ http://secure.example.com-login.verify.co/reset
23
+ http://support-paypal.example.co.uk.refund.cn/login
24
+ http://microsoft.account-security.example.ru/update
25
+ http://amazon.verify-order.example.top/confirm
26
+ http://webscr.paypal.example.phish/login
27
+
known_hosts.csv ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ host,label
2
+ cjplogger.com,LEGIT
3
+ www.cjplogger.com,LEGIT
4
+