Nayan Ghosh committed
Commit · 4ceaa13
1 Parent(s): faefaf7
updated
Browse files
- app.py +1 -1
- auto_retrain.py +11 -13
- convert_to_onnx.py +1 -8
app.py
CHANGED
@@ -37,7 +37,7 @@ except ImportError:
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
-    handlers=[logging.StreamHandler()],
+    handlers=[logging.StreamHandler()],
 )
 log = logging.getLogger("phishguard")

auto_retrain.py
CHANGED
@@ -5,7 +5,7 @@ import time
 import pickle
 import numpy as np
 from datetime import datetime
-from urllib.parse import urlparse
+from urllib.parse import urlparse

 # ── Thresholds ────────────────────────────────────────────────────────────────
 SAFE_REPORT_THRESHOLD = 3
@@ -13,15 +13,13 @@ PHISHING_REPORT_THRESHOLD = 2
 CHECK_INTERVAL_MINUTES = 30
 MIN_NEW_REPORTS_TO_RETRAIN = 5

-
+
 MIN_ACCURACY_TO_REPLACE = 0.88

 DB = "scans.db"


-
-# instead of always hardcoding sqlite3.connect(DB).
-# If app.py defines get_db, we import it; otherwise fall back to local SQLite.
+
 def get_db():
     try:
         from app import get_db as app_get_db
@@ -95,9 +93,9 @@ def process_new_reports():
             print(f"✅ Verified SAFE ({cnt} reports): {url[:60]}")

         elif label == "phishing" and cnt >= PHISHING_REPORT_THRESHOLD:
-
-
-
+
+
+
             safe_count = conn.execute(
                 "SELECT COUNT(*) FROM reports WHERE url=? AND label='safe'", (url,)
             ).fetchone()[0]
@@ -213,8 +211,8 @@ def retrain_model(new_verified_urls, reload_callback=None):
         flist = features_to_list(feats)
         y_val = 0 if label.lower() == "safe" else 1

-
-
+
+
         multiplier = 5
         for _ in range(multiplier):
             extra_X.append(flist)
@@ -254,7 +252,7 @@ def retrain_model(new_verified_urls, reload_callback=None):
     accuracy = accuracy_score(y_te, y_pred)
     print(f"   New model accuracy: {accuracy * 100:.2f}%")

-
+
     if accuracy < MIN_ACCURACY_TO_REPLACE:
         print(f"   ⚠️ Accuracy {accuracy:.2%} below threshold {MIN_ACCURACY_TO_REPLACE:.2%}")
         print("   Keeping existing model - new model not good enough")
@@ -267,7 +265,7 @@ def retrain_model(new_verified_urls, reload_callback=None):
         conn.close()
         return False, accuracy

-
+
     if os.path.exists("model.pkl"):
         os.rename("model.pkl", "model_backup.pkl")
         print("   Old model backed up → model_backup.pkl")
@@ -343,7 +341,7 @@ class AutoRetrainWatcher:
         print(f"   New verified SAFE domains: {len(new_safe)}")
         for url in new_safe:
             try:
-
+
                 domain = urlparse(url).netloc.lower().replace("www.", "")
                 if domain:
                     self.dynamic_whitelist.add(domain)
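The comments removed above described the behaviour of get_db(): reuse the connection helper from app.py when it can be imported, otherwise open the local SQLite database directly instead of always hardcoding sqlite3.connect(DB). A minimal sketch of that fallback pattern, assuming app.get_db returns a ready-to-use connection (its body is not shown in this commit):

import sqlite3

DB = "scans.db"

def get_db():
    """Return a DB connection, preferring the helper defined in app.py."""
    try:
        # If app.py defines get_db, reuse it rather than hardcoding sqlite3.connect(DB).
        from app import get_db as app_get_db
        return app_get_db()
    except ImportError:
        # Otherwise fall back to a plain local SQLite connection.
        return sqlite3.connect(DB)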
convert_to_onnx.py
CHANGED
@@ -1,11 +1,4 @@
-
-# ─────────────────────────────────────────────────────────────────────────────
-# Requirements (local only, NOT needed on Render):
-# pip install torch onnx onnxruntime
-#
-# Run: python convert_to_onnx.py
-# Output: phishnet.onnx (commit this to your repo)
-# ─────────────────────────────────────────────────────────────────────────────
+

 import os
 import pickle