Nayan Ghosh committed on
Commit
4ceaa13
·
1 Parent(s): faefaf7
Files changed (3) hide show
  1. app.py +1 -1
  2. auto_retrain.py +11 -13
  3. convert_to_onnx.py +1 -8
app.py CHANGED
@@ -37,7 +37,7 @@ except ImportError:
37
  logging.basicConfig(
38
  level=logging.INFO,
39
  format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
40
- handlers=[logging.StreamHandler()], # stdout only — HF Spaces has no persistent log files
41
  )
42
  log = logging.getLogger("phishguard")
43
 
 
37
  logging.basicConfig(
38
  level=logging.INFO,
39
  format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
40
+ handlers=[logging.StreamHandler()],
41
  )
42
  log = logging.getLogger("phishguard")
43
 
auto_retrain.py CHANGED
@@ -5,7 +5,7 @@ import time
5
  import pickle
6
  import numpy as np
7
  from datetime import datetime
8
- from urllib.parse import urlparse # FIX: moved to top — was imported inside loops
9
 
10
  # ── Thresholds ────────────────────────────────────────────────────────────────
11
  SAFE_REPORT_THRESHOLD = 3
@@ -13,15 +13,13 @@ PHISHING_REPORT_THRESHOLD = 2
13
  CHECK_INTERVAL_MINUTES = 30
14
  MIN_NEW_REPORTS_TO_RETRAIN = 5
15
 
16
- # New model must achieve at least this accuracy to replace the current one
17
  MIN_ACCURACY_TO_REPLACE = 0.88
18
 
19
  DB = "scans.db"
20
 
21
 
22
- # FIX: Use a shared get_db() that respects Turso when configured,
23
- # instead of always hardcoding sqlite3.connect(DB).
24
- # If app.py defines get_db, we import it; otherwise fall back to local SQLite.
25
  def get_db():
26
  try:
27
  from app import get_db as app_get_db
@@ -95,9 +93,9 @@ def process_new_reports():
95
  print(f"✅ Verified SAFE ({cnt} reports): {url[:60]}")
96
 
97
  elif label == "phishing" and cnt >= PHISHING_REPORT_THRESHOLD:
98
- # FIX: Also check for conflicting safe reports before verifying as phishing
99
- # Previously this was missing — 10 safe reports + 2 phishing would
100
- # still get marked as verified phishing
101
  safe_count = conn.execute(
102
  "SELECT COUNT(*) FROM reports WHERE url=? AND label='safe'", (url,)
103
  ).fetchone()[0]
@@ -213,8 +211,8 @@ def retrain_model(new_verified_urls, reload_callback=None):
213
  flist = features_to_list(feats)
214
  y_val = 0 if label.lower() == "safe" else 1
215
 
216
- # FIX: Balanced multiplier — both classes now get 5x weight
217
- # Previously safe=3x, phishing=5x which biased model toward false positives
218
  multiplier = 5
219
  for _ in range(multiplier):
220
  extra_X.append(flist)
@@ -254,7 +252,7 @@ def retrain_model(new_verified_urls, reload_callback=None):
254
  accuracy = accuracy_score(y_te, y_pred)
255
  print(f" New model accuracy: {accuracy * 100:.2f}%")
256
 
257
- # Accuracy gating — only replace if new model is good enough
258
  if accuracy < MIN_ACCURACY_TO_REPLACE:
259
  print(f" ⚠️ Accuracy {accuracy:.2%} below threshold {MIN_ACCURACY_TO_REPLACE:.2%}")
260
  print(" Keeping existing model — new model not good enough")
@@ -267,7 +265,7 @@ def retrain_model(new_verified_urls, reload_callback=None):
267
  conn.close()
268
  return False, accuracy
269
 
270
- # Backup old model before replacing
271
  if os.path.exists("model.pkl"):
272
  os.rename("model.pkl", "model_backup.pkl")
273
  print(" Old model backed up → model_backup.pkl")
@@ -343,7 +341,7 @@ class AutoRetrainWatcher:
343
  print(f" New verified SAFE domains: {len(new_safe)}")
344
  for url in new_safe:
345
  try:
346
- # FIX: urlparse now imported at top, not re-imported here
347
  domain = urlparse(url).netloc.lower().replace("www.", "")
348
  if domain:
349
  self.dynamic_whitelist.add(domain)
 
5
  import pickle
6
  import numpy as np
7
  from datetime import datetime
8
+ from urllib.parse import urlparse
9
 
10
  # ── Thresholds ────────────────────────────────────────────────────────────────
11
  SAFE_REPORT_THRESHOLD = 3
 
13
  CHECK_INTERVAL_MINUTES = 30
14
  MIN_NEW_REPORTS_TO_RETRAIN = 5
15
 
16
+
17
  MIN_ACCURACY_TO_REPLACE = 0.88
18
 
19
  DB = "scans.db"
20
 
21
 
22
+
 
 
23
  def get_db():
24
  try:
25
  from app import get_db as app_get_db
 
93
  print(f"✅ Verified SAFE ({cnt} reports): {url[:60]}")
94
 
95
  elif label == "phishing" and cnt >= PHISHING_REPORT_THRESHOLD:
96
+
97
+
98
+
99
  safe_count = conn.execute(
100
  "SELECT COUNT(*) FROM reports WHERE url=? AND label='safe'", (url,)
101
  ).fetchone()[0]
 
211
  flist = features_to_list(feats)
212
  y_val = 0 if label.lower() == "safe" else 1
213
 
214
+
215
+
216
  multiplier = 5
217
  for _ in range(multiplier):
218
  extra_X.append(flist)
 
252
  accuracy = accuracy_score(y_te, y_pred)
253
  print(f" New model accuracy: {accuracy * 100:.2f}%")
254
 
255
+
256
  if accuracy < MIN_ACCURACY_TO_REPLACE:
257
  print(f" ⚠️ Accuracy {accuracy:.2%} below threshold {MIN_ACCURACY_TO_REPLACE:.2%}")
258
  print(" Keeping existing model — new model not good enough")
 
265
  conn.close()
266
  return False, accuracy
267
 
268
+
269
  if os.path.exists("model.pkl"):
270
  os.rename("model.pkl", "model_backup.pkl")
271
  print(" Old model backed up → model_backup.pkl")
 
341
  print(f" New verified SAFE domains: {len(new_safe)}")
342
  for url in new_safe:
343
  try:
344
+
345
  domain = urlparse(url).netloc.lower().replace("www.", "")
346
  if domain:
347
  self.dynamic_whitelist.add(domain)
convert_to_onnx.py CHANGED
@@ -1,11 +1,4 @@
1
- # convert_to_onnx.py v2 - Run ONCE locally to convert your PyTorch model to ONNX
2
- # ─────────────────────────────────────────────────────────────────────────────
3
- # Requirements (local only, NOT needed on Render):
4
- # pip install torch onnx onnxruntime
5
- #
6
- # Run: python convert_to_onnx.py
7
- # Output: phishnet.onnx (commit this to your repo)
8
- # ─────────────────────────────────────────────────────────────────────────────
9
 
10
  import os
11
  import pickle
 
1
+
 
 
 
 
 
 
 
2
 
3
  import os
4
  import pickle