Perth0603 committed on
Commit
e2e3793
·
verified ·
1 Parent(s): cedbf8c

Upload 4 files

Browse files
Files changed (3) hide show
  1. Dockerfile +0 -4
  2. README.md +0 -31
  3. app.py +60 -141
Dockerfile CHANGED
@@ -20,11 +20,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
20
  COPY requirements.txt /app/requirements.txt
21
  RUN pip install -r /app/requirements.txt
22
 
23
- # App code and data files
24
  COPY app.py /app/app.py
25
- COPY autocalib_phishy.csv /app/autocalib_phishy.csv
26
- COPY autocalib_legit.csv /app/autocalib_legit.csv
27
- COPY known_hosts.csv /app/known_hosts.csv
28
 
29
  EXPOSE 7860
30
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
20
  COPY requirements.txt /app/requirements.txt
21
  RUN pip install -r /app/requirements.txt
22
 
 
23
  COPY app.py /app/app.py
 
 
 
24
 
25
  EXPOSE 7860
26
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -13,7 +13,6 @@ This Space exposes two endpoints so the Flutter app can call them reliably:
13
  - `phishing_probability` is always the raw probability of phishing (0..1)
14
  - `label` is `PHISH` when `phishing_probability >= threshold`, else `LEGIT`
15
  - `score` is the confidence for the predicted label (for `LEGIT`, `score = 1 - phishing_probability`), which lets the app show "Safe Confidence" for legitimate URLs
16
- - Also includes `predicted_label` (0→LEGIT, 1→PHISH) aligned to dataset polarity, and `raw_proba_class1` for debugging
17
 
18
  ## Files
19
  - Dockerfile - builds a small FastAPI server image
@@ -27,9 +26,6 @@ This Space exposes two endpoints so the Flutter app can call them reliably:
27
  - MODEL_ID = Perth0603/phishing-email-mobilebert
28
  - URL_REPO = Perth0603/Random-Forest-Model-for-PhishingDetection
29
  - URL_FILENAME = url_rf_model.joblib (set to your artifact filename)
30
- - Alternatively use: HF_URL_MODEL_ID, HF_URL_REPO_TYPE, HF_URL_FILENAME
31
- - Optional: AUTOCALIB_PHISHY_CSV, AUTOCALIB_LEGIT_CSV, KNOWN_HOSTS_CSV
32
- - Optional: URL_POSITIVE_CLASS (PHISH or LEGIT)
33
  4. Wait for the Space to build and become green. Test:
34
  - GET `/` should return `{ status: ok, model: ... }`
35
  - POST `/predict` with `{ "inputs": "Win an iPhone! Click here" }`
@@ -46,30 +42,3 @@ Run the app:
46
  ```
47
  flutter run --dart-define-from-file=hf.env.json
48
  ```
49
-
50
- ## CSV configuration
51
-
52
- You can provide CSV files to customize autocalibration URLs and known host overrides.
53
-
54
- Formats:
55
-
56
- ```
57
- # autocalib_phishy.csv
58
- url
59
- http://198.51.100.23/login/update?acc=123
60
- http://secure-login-account-update.example.com/session?id=123
61
- ```
62
-
63
- ```
64
- # autocalib_legit.csv
65
- url
66
- https://www.wikipedia.org/
67
- https://www.python.org/
68
- ```
69
-
70
- ```
71
- # known_hosts.csv
72
- host,label
73
- cjplogger.com,LEGIT
74
- bad-login-update.example.com,PHISH
75
- ```
 
13
  - `phishing_probability` is always the raw probability of phishing (0..1)
14
  - `label` is `PHISH` when `phishing_probability >= threshold`, else `LEGIT`
15
  - `score` is the confidence for the predicted label (for `LEGIT`, `score = 1 - phishing_probability`), which lets the app show "Safe Confidence" for legitimate URLs
 
16
 
17
  ## Files
18
  - Dockerfile - builds a small FastAPI server image
 
26
  - MODEL_ID = Perth0603/phishing-email-mobilebert
27
  - URL_REPO = Perth0603/Random-Forest-Model-for-PhishingDetection
28
  - URL_FILENAME = url_rf_model.joblib (set to your artifact filename)
 
 
 
29
  4. Wait for the Space to build and become green. Test:
30
  - GET `/` should return `{ status: ok, model: ... }`
31
  - POST `/predict` with `{ "inputs": "Win an iPhone! Click here" }`
 
42
  ```
43
  flutter run --dart-define-from-file=hf.env.json
44
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -6,7 +6,6 @@ os.environ.setdefault("TRANSFORMERS_CACHE", "/data/.cache")
6
  os.environ.setdefault("TORCH_HOME", "/data/.cache")
7
 
8
  from typing import Optional, List, Dict, Any
9
- import csv
10
  from urllib.parse import urlparse
11
  import threading
12
  import re
@@ -73,113 +72,64 @@ _url_phish_is_positive: Optional[bool] = None
73
  # -------------------------
74
  # You can edit these lists to define which URLs are considered obviously phishy/legit
75
  # for polarity auto-calibration of classical URL models (e.g., XGBoost, scikit-learn).
76
- # Loaded from CSV. Provide via AUTOCALIB_PHISHY_CSV or hf_space/autocalib_phishy.csv
77
- _AUTOCALIB_PHISHY_URLS: List[str] = []
78
-
79
- # Loaded from CSV. Provide via AUTOCALIB_LEGIT_CSV or hf_space/autocalib_legit.csv
80
- _AUTOCALIB_LEGIT_URLS: List[str] = []
81
-
82
- # Known host overrides (CSV-driven): hf_space/known_hosts.csv or KNOWN_HOSTS_CSV
83
- _KNOWN_LEGIT_HOSTS: List[str] = []
84
- _KNOWN_PHISH_HOSTS: List[str] = []
85
-
86
- def _normalize_host(value: str) -> str:
87
- v = value.strip().lower()
88
- if v.startswith("www."):
89
- v = v[4:]
90
- return v
91
-
92
- def _host_matches_any(host: str, known: List[str]) -> bool:
93
- base = _normalize_host(host)
94
- for item in known:
95
- k = _normalize_host(item)
96
- if base == k or base.endswith("." + k):
97
- return True
98
- return False
99
-
100
- # -------------------------
101
- # CSV configuration support (optional)
102
- # -------------------------
103
- def _read_urls_from_csv(path: str) -> List[str]:
104
- urls: List[str] = []
105
- try:
106
- with open(path, newline="", encoding="utf-8") as f:
107
- reader = csv.DictReader(f)
108
- if "url" in (reader.fieldnames or []):
109
- for row in reader:
110
- val = str(row.get("url", "")).strip()
111
- if val:
112
- urls.append(val)
113
- else:
114
- f.seek(0)
115
- f2 = csv.reader(f)
116
- for row in f2:
117
- if not row:
118
- continue
119
- val = str(row[0]).strip()
120
- if val.lower() == "url":
121
- continue
122
- if val:
123
- urls.append(val)
124
- except Exception as e:
125
- print(f"[csv] failed reading URLs from {path}: {e}")
126
- return urls
127
-
128
- def _read_hosts_from_csv(path: str) -> Dict[str, str]:
129
- host_to_label: Dict[str, str] = {}
130
- try:
131
- with open(path, newline="", encoding="utf-8") as f:
132
- reader = csv.DictReader(f)
133
- fields = [x.lower() for x in (reader.fieldnames or [])]
134
- if "host" in fields and "label" in fields:
135
- for row in reader:
136
- host = str(row.get("host", "")).strip().lower()
137
- label = str(row.get("label", "")).strip().upper()
138
- if host and label in ("PHISH", "LEGIT"):
139
- host_to_label[host] = label
140
- else:
141
- f.seek(0)
142
- f2 = csv.reader(f)
143
- for row in f2:
144
- if len(row) < 2:
145
- continue
146
- host = str(row[0]).strip().lower()
147
- label = str(row[1]).strip().upper()
148
- if host.lower() == "host" and label == "LABEL":
149
- continue
150
- if host and label in ("PHISH", "LEGIT"):
151
- host_to_label[host] = label
152
- except Exception as e:
153
- print(f"[csv] failed reading hosts from {path}: {e}")
154
- return host_to_label
155
-
156
- def _load_csv_configs_if_any():
157
- base_dir = os.path.dirname(__file__)
158
- phishy_csv = os.environ.get("AUTOCALIB_PHISHY_CSV", os.path.join(base_dir, "autocalib_phishy.csv"))
159
- legit_csv = os.environ.get("AUTOCALIB_LEGIT_CSV", os.path.join(base_dir, "autocalib_legit.csv"))
160
- hosts_csv = os.environ.get("KNOWN_HOSTS_CSV", os.path.join(base_dir, "known_hosts.csv"))
161
-
162
- if os.path.exists(phishy_csv):
163
- urls = _read_urls_from_csv(phishy_csv)
164
- if urls:
165
- print(f"[csv] loaded phishy URLs: {len(urls)} from {phishy_csv}")
166
- _AUTOCALIB_PHISHY_URLS[:] = urls
167
- if os.path.exists(legit_csv):
168
- urls = _read_urls_from_csv(legit_csv)
169
- if urls:
170
- print(f"[csv] loaded legit URLs: {len(urls)} from {legit_csv}")
171
- _AUTOCALIB_LEGIT_URLS[:] = urls
172
- if os.path.exists(hosts_csv):
173
- mapping = _read_hosts_from_csv(hosts_csv)
174
- if mapping:
175
- print(f"[csv] loaded known hosts: {len(mapping)} from {hosts_csv}")
176
- _KNOWN_LEGIT_HOSTS.clear()
177
- _KNOWN_PHISH_HOSTS.clear()
178
- for host, label in mapping.items():
179
- if label == "LEGIT":
180
- _KNOWN_LEGIT_HOSTS.append(host)
181
- elif label == "PHISH":
182
- _KNOWN_PHISH_HOSTS.append(host)
183
 
184
  # -------------------------
185
  # URL features (must match training)
@@ -279,21 +229,6 @@ def _auto_calibrate_phish_positive(bundle: Dict[str, Any], feature_cols: List[st
279
 
280
  phishy = _AUTOCALIB_PHISHY_URLS
281
  legit = _AUTOCALIB_LEGIT_URLS
282
- # Guard: if CSVs are empty, fall back to safe defaults
283
- if not phishy:
284
- phishy = [
285
- "http://198.51.100.23/login/update?acc=123",
286
- "http://secure-login-account-update.example.com/session?id=123",
287
- "http://bank.verify-update-security.com/confirm",
288
- "http://paypal.com.account-verify.cn/login",
289
- ]
290
- if not legit:
291
- legit = [
292
- "https://www.wikipedia.org/",
293
- "https://www.python.org/",
294
- "https://www.microsoft.com/",
295
- "https://www.openai.com/",
296
- ]
297
 
298
  model = bundle.get("model")
299
  model_type: str = str(bundle.get("model_type") or "")
@@ -359,8 +294,6 @@ def _startup():
359
  print(f"[startup] text model load failed: {e}")
360
  try:
361
  _load_url_model()
362
- # Load CSV-based config if present
363
- _load_csv_configs_if_any()
364
  global _url_phish_is_positive
365
  b = _url_bundle
366
  if isinstance(b, dict) and _url_phish_is_positive is None:
@@ -380,20 +313,6 @@ def _startup():
380
  def root():
381
  return {"status": "ok", "model": MODEL_ID}
382
 
383
- @app.get("/debug-config")
384
- def debug_config():
385
- return {
386
- "phishy_count": len(_AUTOCALIB_PHISHY_URLS),
387
- "legit_count": len(_AUTOCALIB_LEGIT_URLS),
388
- "known_legit_hosts": _KNOWN_LEGIT_HOSTS[:50],
389
- "known_phish_hosts": _KNOWN_PHISH_HOSTS[:50],
390
- "url_repo": URL_REPO,
391
- "url_repo_type": URL_REPO_TYPE,
392
- "url_filename": URL_FILENAME,
393
- "phish_is_positive_env": URL_POSITIVE_CLASS_ENV if URL_POSITIVE_CLASS_ENV else None,
394
- "resolved_phish_is_positive": _url_phish_is_positive,
395
- }
396
-
397
  @app.post("/predict")
398
  def predict(payload: PredictPayload):
399
  try:
@@ -468,9 +387,9 @@ def predict_url(payload: PredictUrlPayload):
468
  host = (urlparse(url_str).hostname or "").lower()
469
  if host:
470
  override_label: Optional[str] = None
471
- if _host_matches_any(host, _KNOWN_LEGIT_HOSTS):
472
  override_label = "LEGIT"
473
- elif _host_matches_any(host, _KNOWN_PHISH_HOSTS):
474
  override_label = "PHISH"
475
  if override_label is not None:
476
  # Map numeric label according to resolved polarity
 
6
  os.environ.setdefault("TORCH_HOME", "/data/.cache")
7
 
8
  from typing import Optional, List, Dict, Any
 
9
  from urllib.parse import urlparse
10
  import threading
11
  import re
 
72
  # -------------------------
73
  # You can edit these lists to define which URLs are considered obviously phishy/legit
74
  # for polarity auto-calibration of classical URL models (e.g., XGBoost, scikit-learn).
75
+ _AUTOCALIB_PHISHY_URLS: List[str] = [
76
+ "http://198.51.100.23/login/update?acc=123",
77
+ "http://secure-login-account-update.example.com/session?id=123",
78
+ "http://bank.verify-update-security.com/confirm",
79
+ "http://paypal.com.account-verify.cn/login",
80
+ "http://abc.xyz/downloads/invoice.exe",
81
+ "http://203.0.113.45/verify/account",
82
+ "http://login-secure-update.example.net/confirm",
83
+ "http://paypal.com.verify.bill.cn/login",
84
+ "http://account-update-security-pay.example.org/verify",
85
+ "http://secure-login-microsoft.example.info/reset",
86
+ "http://login.verify-paypal.support-id.example.com/",
87
+ "http://dropbox.com.security-alert.example.net/login",
88
+ "http://bankofamerica.secure-update.example.co/verify",
89
+ "http://icloud.apple.com.signin.security-alert.example.co/login",
90
+ "http://google.com.accounts.security-check.example.xyz/signin",
91
+ "http://update-billing-info.example-downloads.com/invoice.zip",
92
+ "http://download.secure-update.example.com/app.exe",
93
+ "http://192.0.2.10/secure/login",
94
+ "http://198.51.100.50/account/verify?session=abc",
95
+ "http://example.com@evil.com/login",
96
+ "http://secure.example.com-login.verify.co/reset",
97
+ "http://support-paypal.example.co.uk.refund.cn/login",
98
+ "http://microsoft.account-security.example.ru/update",
99
+ "http://amazon.verify-order.example.top/confirm",
100
+ "http://webscr.paypal.example.phish/login",
101
+ ]
102
+
103
+ _AUTOCALIB_LEGIT_URLS: List[str] = [
104
+ "https://www.wikipedia.org/",
105
+ "https://www.microsoft.com/",
106
+ "https://www.openai.com/",
107
+ "https://www.python.org/",
108
+ "https://www.gov.uk/",
109
+ "https://www.google.com/",
110
+ "https://www.apple.com/",
111
+ "https://github.com/",
112
+ "https://stackoverflow.com/",
113
+ "https://www.bbc.com/",
114
+ "https://www.nytimes.com/",
115
+ "https://www.nasa.gov/",
116
+ "https://www.mozilla.org/",
117
+ "https://www.cloudflare.com/",
118
+ "https://www.reddit.com/",
119
+ "https://www.linkedin.com/",
120
+ "https://www.youtube.com/",
121
+ "https://developer.apple.com/",
122
+ "https://aws.amazon.com/",
123
+ "https://azure.microsoft.com/",
124
+ ]
125
+
126
+ # Known host overrides (editable): force certain domains as LEGIT or PHISH
127
+ _KNOWN_LEGIT_HOSTS: List[str] = [
128
+ "cjplogger.com",
129
+ "www.cjplogger.com",
130
+ ]
131
+ _KNOWN_PHISH_HOSTS: List[str] = [
132
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
  # -------------------------
135
  # URL features (must match training)
 
229
 
230
  phishy = _AUTOCALIB_PHISHY_URLS
231
  legit = _AUTOCALIB_LEGIT_URLS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
 
233
  model = bundle.get("model")
234
  model_type: str = str(bundle.get("model_type") or "")
 
294
  print(f"[startup] text model load failed: {e}")
295
  try:
296
  _load_url_model()
 
 
297
  global _url_phish_is_positive
298
  b = _url_bundle
299
  if isinstance(b, dict) and _url_phish_is_positive is None:
 
313
  def root():
314
  return {"status": "ok", "model": MODEL_ID}
315
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  @app.post("/predict")
317
  def predict(payload: PredictPayload):
318
  try:
 
387
  host = (urlparse(url_str).hostname or "").lower()
388
  if host:
389
  override_label: Optional[str] = None
390
+ if host in _KNOWN_LEGIT_HOSTS:
391
  override_label = "LEGIT"
392
+ elif host in _KNOWN_PHISH_HOSTS:
393
  override_label = "PHISH"
394
  if override_label is not None:
395
  # Map numeric label according to resolved polarity