Perth0603 committed on
Commit
b16cfad
·
verified ·
1 Parent(s): 99ed65e

Upload 7 files

Browse files
Files changed (5) hide show
  1. README.md +31 -0
  2. app.py +111 -23
  3. autocalib_legit.csv +27 -0
  4. autocalib_phishy.csv +19 -0
  5. known_hosts.csv +14 -0
README.md CHANGED
@@ -13,6 +13,7 @@ This Space exposes two endpoints so the Flutter app can call them reliably:
13
  - `phishing_probability` is always the raw probability of phishing (0..1)
14
  - `label` is `PHISH` when `phishing_probability >= threshold`, else `LEGIT`
15
  - `score` is the confidence for the predicted label (for `LEGIT`, `score = 1 - phishing_probability`), which lets the app show "Safe Confidence" for legitimate URLs
 
16
 
17
  ## Files
18
  - Dockerfile - builds a small FastAPI server image
@@ -26,6 +27,9 @@ This Space exposes two endpoints so the Flutter app can call them reliably:
26
  - MODEL_ID = Perth0603/phishing-email-mobilebert
27
  - URL_REPO = Perth0603/Random-Forest-Model-for-PhishingDetection
28
  - URL_FILENAME = url_rf_model.joblib (set to your artifact filename)
 
 
 
29
  4. Wait for the Space to build and become green. Test:
30
  - GET `/` should return `{ status: ok, model: ... }`
31
  - POST `/predict` with `{ "inputs": "Win an iPhone! Click here" }`
@@ -42,3 +46,30 @@ Run the app:
42
  ```
43
  flutter run --dart-define-from-file=hf.env.json
44
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  - `phishing_probability` is always the raw probability of phishing (0..1)
14
  - `label` is `PHISH` when `phishing_probability >= threshold`, else `LEGIT`
15
  - `score` is the confidence for the predicted label (for `LEGIT`, `score = 1 - phishing_probability`), which lets the app show "Safe Confidence" for legitimate URLs
16
+ - Also includes `predicted_label` (0→LEGIT, 1→PHISH) aligned to dataset polarity, and `raw_proba_class1` for debugging
17
 
18
  ## Files
19
  - Dockerfile - builds a small FastAPI server image
 
27
  - MODEL_ID = Perth0603/phishing-email-mobilebert
28
  - URL_REPO = Perth0603/Random-Forest-Model-for-PhishingDetection
29
  - URL_FILENAME = url_rf_model.joblib (set to your artifact filename)
30
+ - Alternatively use: HF_URL_MODEL_ID, HF_URL_REPO_TYPE, HF_URL_FILENAME
31
+ - Optional: AUTOCALIB_PHISHY_CSV, AUTOCALIB_LEGIT_CSV, KNOWN_HOSTS_CSV
32
+ - Optional: URL_POSITIVE_CLASS (PHISH or LEGIT)
33
  4. Wait for the Space to build and become green. Test:
34
  - GET `/` should return `{ status: ok, model: ... }`
35
  - POST `/predict` with `{ "inputs": "Win an iPhone! Click here" }`
 
46
  ```
47
  flutter run --dart-define-from-file=hf.env.json
48
  ```
49
+
50
+ ## CSV configuration
51
+
52
+ You can provide CSV files to customize autocalibration URLs and known host overrides.
53
+
54
+ Formats:
55
+
56
+ ```
57
+ # autocalib_phishy.csv
58
+ url
59
+ http://198.51.100.23/login/update?acc=123
60
+ http://secure-login-account-update.example.com/session?id=123
61
+ ```
62
+
63
+ ```
64
+ # autocalib_legit.csv
65
+ url
66
+ https://www.wikipedia.org/
67
+ https://www.python.org/
68
+ ```
69
+
70
+ ```
71
+ # known_hosts.csv
72
+ host,label
73
+ cjplogger.com,LEGIT
74
+ bad-login-update.example.com,PHISH
75
+ ```
app.py CHANGED
@@ -6,6 +6,7 @@ os.environ.setdefault("TRANSFORMERS_CACHE", "/data/.cache")
6
  os.environ.setdefault("TORCH_HOME", "/data/.cache")
7
 
8
  from typing import Optional, List, Dict, Any
 
9
  from urllib.parse import urlparse
10
  import threading
11
  import re
@@ -72,29 +73,99 @@ _url_phish_is_positive: Optional[bool] = None
72
  # -------------------------
73
  # You can edit these lists to define which URLs are considered obviously phishy/legit
74
  # for polarity auto-calibration of classical URL models (e.g., XGBoost, scikit-learn).
75
- _AUTOCALIB_PHISHY_URLS: List[str] = [
76
- "http://198.51.100.23/login/update?acc=123",
77
- "http://secure-login-account-update.example.com/session?id=123",
78
- "http://bank.verify-update-security.com/confirm",
79
- "http://paypal.com.account-verify.cn/login",
80
- "http://abc.xyz/downloads/invoice.exe",
81
- ]
82
-
83
- _AUTOCALIB_LEGIT_URLS: List[str] = [
84
- "https://www.wikipedia.org/",
85
- "https://www.microsoft.com/",
86
- "https://www.openai.com/",
87
- "https://www.python.org/",
88
- "https://www.gov.uk/",
89
- ]
90
-
91
- # Known host overrides (editable): force certain domains as LEGIT or PHISH
92
- _KNOWN_LEGIT_HOSTS: List[str] = [
93
- "cjplogger.com",
94
- "www.cjplogger.com",
95
- ]
96
- _KNOWN_PHISH_HOSTS: List[str] = [
97
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
  # -------------------------
100
  # URL features (must match training)
@@ -194,6 +265,21 @@ def _auto_calibrate_phish_positive(bundle: Dict[str, Any], feature_cols: List[st
194
 
195
  phishy = _AUTOCALIB_PHISHY_URLS
196
  legit = _AUTOCALIB_LEGIT_URLS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
  model = bundle.get("model")
199
  model_type: str = str(bundle.get("model_type") or "")
@@ -259,6 +345,8 @@ def _startup():
259
  print(f"[startup] text model load failed: {e}")
260
  try:
261
  _load_url_model()
 
 
262
  global _url_phish_is_positive
263
  b = _url_bundle
264
  if isinstance(b, dict) and _url_phish_is_positive is None:
 
6
  os.environ.setdefault("TORCH_HOME", "/data/.cache")
7
 
8
  from typing import Optional, List, Dict, Any
9
+ import csv
10
  from urllib.parse import urlparse
11
  import threading
12
  import re
 
73
  # -------------------------
74
  # You can edit these lists to define which URLs are considered obviously phishy/legit
75
  # for polarity auto-calibration of classical URL models (e.g., XGBoost, scikit-learn).
76
+ # Loaded from CSV: path taken from the AUTOCALIB_PHISHY_CSV env var, else autocalib_phishy.csv next to this file
77
+ _AUTOCALIB_PHISHY_URLS: List[str] = []
78
+
79
+ # Loaded from CSV: path taken from the AUTOCALIB_LEGIT_CSV env var, else autocalib_legit.csv next to this file
80
+ _AUTOCALIB_LEGIT_URLS: List[str] = []
81
+
82
+ # Known host overrides (CSV-driven): path taken from the KNOWN_HOSTS_CSV env var, else known_hosts.csv next to this file
83
+ _KNOWN_LEGIT_HOSTS: List[str] = []
84
+ _KNOWN_PHISH_HOSTS: List[str] = []
85
+
86
+ # -------------------------
87
+ # CSV configuration support (optional)
88
+ # -------------------------
89
def _read_urls_from_csv(path: str) -> List[str]:
    """Read a list of URLs from a CSV file (best-effort).

    Two layouts are accepted:

    * A file whose header row contains a ``url`` column (matched
      case-insensitively, surrounding whitespace ignored): that column is read.
    * Any other file: the first column of every row is read, and a cell equal
      to ``url`` (case-insensitive) is skipped as a stray header.

    Blank values are dropped and surrounding whitespace is stripped.

    Args:
        path: Filesystem path of the CSV file.

    Returns:
        The URLs in file order. On any read/parse failure the error is printed
        and whatever was collected so far (possibly nothing) is returned.
    """
    urls: List[str] = []
    try:
        # "utf-8-sig" transparently drops a leading BOM (common in files saved
        # by spreadsheet tools); otherwise the header would read "\ufeffurl"
        # and be ingested as if it were a URL.
        with open(path, newline="", encoding="utf-8-sig") as f:
            reader = csv.DictReader(f)
            url_key = next(
                (
                    name
                    for name in (reader.fieldnames or [])
                    if name is not None and name.strip().lower() == "url"
                ),
                None,
            )
            if url_key is not None:
                for row in reader:
                    # Short rows yield None for missing columns; treat as blank
                    # instead of letting str(None) produce the text "None".
                    val = str(row.get(url_key) or "").strip()
                    if val:
                        urls.append(val)
            else:
                # No recognizable header: rewind and read the first column.
                f.seek(0)
                for row in csv.reader(f):
                    if not row:
                        continue
                    val = str(row[0]).strip()
                    if not val or val.lower() == "url":
                        continue
                    urls.append(val)
    except Exception as e:
        # Deliberately broad: CSV config is optional and must never abort startup.
        print(f"[csv] failed reading URLs from {path}: {e}")
    return urls
113
+
114
def _read_hosts_from_csv(path: str) -> Dict[str, str]:
    """Read host -> label overrides from a CSV file (best-effort).

    Two layouts are accepted:

    * A file whose header row contains ``host`` and ``label`` columns (matched
      case-insensitively, surrounding whitespace ignored).
    * Any other file: column 0 is taken as the host and column 1 as the label.

    Hosts are lower-cased and labels upper-cased. Only rows whose label is
    ``PHISH`` or ``LEGIT`` are kept, which also discards a stray
    ``host,label`` header row in the header-less layout. When a host appears
    more than once, the last occurrence wins.

    Args:
        path: Filesystem path of the CSV file.

    Returns:
        Mapping of host to ``"PHISH"`` or ``"LEGIT"``. On any read/parse
        failure the error is printed and whatever was collected so far
        (possibly nothing) is returned.
    """
    valid_labels = ("PHISH", "LEGIT")
    host_to_label: Dict[str, str] = {}
    try:
        # "utf-8-sig" drops a leading BOM so the first header cell is "host",
        # not "\ufeffhost".
        with open(path, newline="", encoding="utf-8-sig") as f:
            reader = csv.DictReader(f)
            # Map normalized header name -> actual header text so the row
            # lookup uses the same key the header check matched on.
            by_lower = {
                name.strip().lower(): name
                for name in (reader.fieldnames or [])
                if name is not None
            }
            host_key = by_lower.get("host")
            label_key = by_lower.get("label")
            if host_key is not None and label_key is not None:
                for row in reader:
                    # Short rows yield None for missing columns; treat as blank.
                    host = str(row.get(host_key) or "").strip().lower()
                    label = str(row.get(label_key) or "").strip().upper()
                    if host and label in valid_labels:
                        host_to_label[host] = label
            else:
                # No recognizable header: rewind and use positional columns.
                f.seek(0)
                for row in csv.reader(f):
                    if len(row) < 2:
                        continue
                    host = str(row[0]).strip().lower()
                    label = str(row[1]).strip().upper()
                    if host and label in valid_labels:
                        host_to_label[host] = label
    except Exception as e:
        # Deliberately broad: CSV config is optional and must never abort startup.
        print(f"[csv] failed reading hosts from {path}: {e}")
    return host_to_label
141
+
142
def _load_csv_configs_if_any():
    """Refresh the module-level URL and host lists from optional CSV files.

    Each CSV path comes from an environment variable (AUTOCALIB_PHISHY_CSV,
    AUTOCALIB_LEGIT_CSV, KNOWN_HOSTS_CSV) and defaults to a file of the
    corresponding name in this module's directory. A list is replaced in place
    only when its file exists and yields at least one entry; otherwise the
    current contents are left untouched.
    """
    here = os.path.dirname(__file__)

    def _resolve(env_name: str, default_name: str) -> str:
        # Environment variable wins; otherwise look beside this module.
        return os.environ.get(env_name, os.path.join(here, default_name))

    phishy_path = _resolve("AUTOCALIB_PHISHY_CSV", "autocalib_phishy.csv")
    legit_path = _resolve("AUTOCALIB_LEGIT_CSV", "autocalib_legit.csv")
    hosts_path = _resolve("KNOWN_HOSTS_CSV", "known_hosts.csv")

    if os.path.exists(phishy_path):
        loaded = _read_urls_from_csv(phishy_path)
        if loaded:
            print(f"[csv] loaded phishy URLs: {len(loaded)} from {phishy_path}")
            _AUTOCALIB_PHISHY_URLS[:] = loaded

    if os.path.exists(legit_path):
        loaded = _read_urls_from_csv(legit_path)
        if loaded:
            print(f"[csv] loaded legit URLs: {len(loaded)} from {legit_path}")
            _AUTOCALIB_LEGIT_URLS[:] = loaded

    if not os.path.exists(hosts_path):
        return
    overrides = _read_hosts_from_csv(hosts_path)
    if not overrides:
        return
    print(f"[csv] loaded known hosts: {len(overrides)} from {hosts_path}")
    # Slice-assign so the existing list objects are mutated in place.
    _KNOWN_LEGIT_HOSTS[:] = [h for h, tag in overrides.items() if tag == "LEGIT"]
    _KNOWN_PHISH_HOSTS[:] = [h for h, tag in overrides.items() if tag == "PHISH"]
169
 
170
  # -------------------------
171
  # URL features (must match training)
 
265
 
266
  phishy = _AUTOCALIB_PHISHY_URLS
267
  legit = _AUTOCALIB_LEGIT_URLS
268
+ # Guard: if CSVs are empty, fall back to safe defaults
269
+ if not phishy:
270
+ phishy = [
271
+ "http://198.51.100.23/login/update?acc=123",
272
+ "http://secure-login-account-update.example.com/session?id=123",
273
+ "http://bank.verify-update-security.com/confirm",
274
+ "http://paypal.com.account-verify.cn/login",
275
+ ]
276
+ if not legit:
277
+ legit = [
278
+ "https://www.wikipedia.org/",
279
+ "https://www.python.org/",
280
+ "https://www.microsoft.com/",
281
+ "https://www.openai.com/",
282
+ ]
283
 
284
  model = bundle.get("model")
285
  model_type: str = str(bundle.get("model_type") or "")
 
345
  print(f"[startup] text model load failed: {e}")
346
  try:
347
  _load_url_model()
348
+ # Load CSV-based config if present
349
+ _load_csv_configs_if_any()
350
  global _url_phish_is_positive
351
  b = _url_bundle
352
  if isinstance(b, dict) and _url_phish_is_positive is None:
autocalib_legit.csv ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ url
2
+ https://www.wikipedia.org/
3
+ https://www.microsoft.com/
4
+ https://www.openai.com/
5
+ https://www.python.org/
6
+ https://www.gov.uk/
7
+ https://www.google.com/
8
+ https://www.apple.com/
9
+ https://www.amazon.com/
10
+ https://www.github.com/
11
+ https://stackoverflow.com/
12
+ https://www.nytimes.com/
13
+ https://www.bbc.com/
14
+ https://www.cnn.com/
15
+ https://www.gov.sg/
16
+ https://www.whitehouse.gov/
17
+ https://www.europa.eu/
18
+ https://www.cloudflare.com/
19
+ https://www.dropbox.com/
20
+ https://drive.google.com/
21
+ https://www.paypal.com/
22
+ https://www.facebook.com/
23
+ https://www.linkedin.com/
24
+ https://www.youtube.com/
25
+ https://www.reddit.com/
26
+ http://www.cjplogger.com/
27
+
autocalib_phishy.csv ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ url
2
+ http://198.51.100.23/login/update?acc=123
3
+ http://secure-login-account-update.example.com/session?id=123
4
+ http://bank.verify-update-security.com/confirm
5
+ http://paypal.com.account-verify.cn/login
6
+ http://abc.xyz/downloads/invoice.exe
7
+ http://update-login-security-paypal.com/verify
8
+ http://login-secure-paypa1.com/
9
+ http://verify-account-bankof-usa.example.co/reset
10
+ http://support.microsoft.com.example.net/reset-password
11
+ http://secure.appleid.apple.com.example.co/login
12
+ http://drive-google-com.example.org/share/document?id=123
13
+ http://198.51.100.45/pay/confirm?trx=9988
14
+ http://203.0.113.10/parcel/tracking/update
15
+ http://signin-amazon.example.tk/refund
16
+ http://security-update-facebook.example.in/login
17
+ http://login-secure-outlook.example.biz/
18
+ http://dropbox-login.example.co/downloads/setup.zip
19
+
known_hosts.csv ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ host,label
2
+ cjplogger.com,LEGIT
3
+ www.cjplogger.com,LEGIT
4
+ wikipedia.org,LEGIT
5
+ www.wikipedia.org,LEGIT
6
+ microsoft.com,LEGIT
7
+ www.microsoft.com,LEGIT
8
+ google.com,LEGIT
9
+ www.google.com,LEGIT
10
+ github.com,LEGIT
11
+ www.github.com,LEGIT
12
+ python.org,LEGIT
13
+ www.python.org,LEGIT
14
+