Upload 72 files
- .gitattributes +36 -36
- Models/Unconfirmed 784952.crdownload +3 -0
- Models/linear_model (1).pkl +3 -0
- Models/logistic_model.pkl +2 -2
- Models/logvectorizer.pkl +3 -0
- README.md +13 -0
- Templates/NB_spam.html +70 -70
- Templates/logistic.html +1 -1
- app.py +653 -356
- load_file.py +17 -6
- requirements.txt +0 -0
- save_token.py +1 -1
- train_logistic_model.py +19 -27
.gitattributes
CHANGED
@@ -1,36 +1,36 @@
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
 *.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
 *.npy filter=lfs diff=lfs merge=lfs -text
 *.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pickle filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-
+Models/Unconfirmed[[:space:]]784952.crdownload filter=lfs diff=lfs merge=lfs -text
Models/Unconfirmed 784952.crdownload
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b76814785969081fb542eb90f1adca0b7e08af310da68ab91231c806c4e3d53d
+size 69189991
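Files matched by the .gitattributes rules above are committed as Git LFS pointer files like the one just shown: three lines giving the spec version, the SHA-256 of the real object, and its size in bytes. As a hedged illustration (the helper below is not part of the commit, only the pointer format is), such a file can be parsed like this:

# Sketch: parse a Git LFS pointer file into its fields (version, oid, size).
# Only the three-line format shown in this commit is assumed; the helper name
# and the example call are illustrative.
from pathlib import Path

def read_lfs_pointer(path: str) -> dict:
    fields = {}
    for line in Path(path).read_text().splitlines():
        if not line.strip():
            continue
        key, _, value = line.partition(" ")
        fields[key] = value          # "version", "oid", "size"
    fields["size"] = int(fields.get("size", 0))
    return fields

# Example: read_lfs_pointer("Models/logvectorizer.pkl") would return
# {"version": "https://git-lfs.github.com/spec/v1", "oid": "sha256:e51b...", "size": 160142}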
Models/linear_model (1).pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e686db9126ad24dbdd3eaee6b9915cce209e0c703e3279c23787cdb3f1fa6e7a
+size 577
Models/logistic_model.pkl
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:57c8921a04cc148eb213bc4e1d21bf7d4e027401ea0dbe272567d6d6dd12d920
+size 40863
Models/logvectorizer.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e51b1d8b6c8975d5469c9c7540af43fab5ac2bdce0008d7109cfdab4fd481917
+size 160142
README.md
ADDED
@@ -0,0 +1,13 @@
+---
+title: Neroml
+emoji: 📉
+colorFrom: indigo
+colorTo: yellow
+sdk: gradio
+sdk_version: 5.43.1
+app_file: app.py
+pinned: false
+short_description: IT is a web page that teach ml algorithm with visualisation
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Templates/NB_spam.html
CHANGED
@@ -72,74 +72,74 @@
 </style>
 </head>
 <body>
-(old lines 75 to 143 were collapsed in the diff viewer)
-<
-</
+<title>Naive Bayes URL Spam Checker</title>
+</head>
+<body>
+<h1>🔍 Naive Bayes URL Spam Checker</h1>
+
+<input type="text" id="urlInput" placeholder="Enter URL (e.g. http://example.com)">
+<br>
+<button onclick="checkURL()">Check</button>
+
+<div id="result"></div>
+<div id="spellSteps"></div>
+<div id="reason"></div>
+
+<script>
+async function checkURL() {
+    const url = document.getElementById("urlInput").value.trim();
+    const resultDiv = document.getElementById("result");
+    const reasonDiv = document.getElementById("reason");
+    const spellStepsDiv = document.getElementById("spellSteps");
+
+    resultDiv.innerHTML = "⏳ Checking...";
+    reasonDiv.innerHTML = "";
+    spellStepsDiv.innerHTML = "";
+
+    try {
+        const response = await fetch('/predict', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({ url: url })
+        });
+
+        const data = await response.json();
+
+        if (data.prediction === 1) {
+            resultDiv.innerHTML = "🚫 <span class='spam'>SPAM / PHISHING</span>";
+        } else {
+            resultDiv.innerHTML = "✅ <span class='safe'>This URL is SAFE</span>";
+        }
+
+        if (data.reason) {
+            reasonDiv.innerText = `🔍 Reason: ${data.reason}`;
+        }
+
+        if (data.steps && data.steps.length > 0) {
+            const title = document.createElement("h3");
+            title.innerText = "🧠 Spell Checker Log:";
+            spellStepsDiv.appendChild(title);
+
+            data.steps.forEach((step) => {
+                const line = document.createElement("div");
+                line.innerHTML = step.valid
+                    ? `✅ ${step.word} → Valid`
+                    : `❌ ${step.word} → Misspelled`;
+                line.style.color = step.valid ? "green" : "red";
+                spellStepsDiv.appendChild(line);
+            });
+        }
+    } catch (err) {
+        resultDiv.innerHTML = "⚠️ Error checking the URL.";
+        reasonDiv.innerText = err.message;
+    }
+}
+</script>
+
+<div class="mt-6 text-center">
+    <a href="/naive_bayes" class="inline-block bg-gray-200 hover:bg-gray-300 text-gray-800 px-4 py-2 rounded shadow">
+        ← Back to Naive Bayes classification
+    </a>
+</div>
 </body>
-</html>
+</html>
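The page above posts the entered URL as JSON to the Flask /predict route and renders the prediction, the reason string, and the spell-checker steps it gets back. As a rough sketch of exercising the same endpoint without the page (the local host and port are assumptions for testing, not stated anywhere in the commit):

# Sketch: call the /predict endpoint that checkURL() in NB_spam.html uses.
# The base URL is an assumption for a local run of the app.
import requests

resp = requests.post(
    "http://127.0.0.1:5000/predict",
    json={"url": "http://example.com/login-free-prize"},
    timeout=30,
)
data = resp.json()
print(data.get("prediction"))   # 1 = spam/phishing, 0 = safe
print(data.get("reason"))       # human-readable reason string
for step in data.get("steps", []):
    print(step["word"], "valid" if step["valid"] else "misspelled")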
Templates/logistic.html
CHANGED
@@ -58,7 +58,7 @@
 <ul class="space-y-1 font-mono text-xs">
 <li><strong>Cleaned Text:</strong> {{ cleaned }}</li>
 <li><strong>Tokenized:</strong> {{ tokens }}</li>
-
+
 <li><strong>Sigmoid Output:</strong> {{ probability }}</li>
 <li><strong>Final Prediction:</strong> {{ prediction }}</li>
 </ul>
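The template's "Sigmoid Output" row shows the spam probability that app.py obtains from predict_proba and then thresholds at 0.5. For reference, the sigmoid a logistic model applies to its raw score z is 1 / (1 + e^-z); a tiny illustration (the score value is made up, not taken from the committed model):

# Sketch: how a logistic model turns a raw score into the probability the
# template displays. The score 1.2 is invented for illustration.
import math

def sigmoid(z: float) -> float:
    return 1.0 / (1.0 + math.exp(-z))

z = 1.2                      # hypothetical w.x + b for one message
probability = sigmoid(z)     # about 0.769, the "Sigmoid Output"
prediction = "Spam" if probability >= 0.5 else "Not Spam"
print(round(probability, 4), prediction)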
app.py
CHANGED
Old side (removed and context lines):

@@ -38,6 +38,18 @@ from dotenv import load_dotenv
 import os
 from urllib.parse import urlparse
 import tldextract
 # Load environment variables from .env
 load_dotenv()
 #spam url import relateted

@@ -83,59 +95,59 @@ import google.generativeai as genai
 #huggung face code start
-from huggingface_hub import hf_hub_download
-import joblib
-import numpy as np
-import torch
-
-REPO_ID = "deedrop1140/my-ml-models"
-
-def load_file(filename):
-    (function body collapsed in the diff viewer)
-# =====================
-# Replace your old model loads with this:
-# =====================
-
-# Models
-knn_model = load_file("Models/knn_model.pkl")
-lasso_model = load_file("Models/lasso_model.pkl")
-liar_model = load_file("Models/liar_model.joblib")
-linear_model = load_file("Models/linear_model.pkl")
-logistic_model = load_file("Models/logistic_model.pkl")
-nb_url_model = load_file("Models/nb_url_model.pkl")
-poly_model = load_file("Models/poly_model.pkl")
-rf_model = load_file("Models/rf_model.pkl")
-ridge_model = load_file("Models/ridge_model.pkl")
-supervised_model = load_file("Models/supervised_model.pkl")
-svr_model = load_file("Models/svr_model.pkl")
-voting_url_model = load_file("Models/voting_url_model.pkl")
-
-# Vectorizers / Encoders / Scalers
-label_classes = load_file("Models/label_classes.npy")
-label_encoder = load_file("Models/label_encoder.pkl")
-lasso_scaler = load_file("Models/lasso_scaler.pkl")
-liar_vectorizer = load_file("Models/liar_vectorizer.joblib")
-nb_url_vectorizer = load_file("Models/nb_url_vectorizer.pkl")
-poly_transform = load_file("Models/poly_transform.pkl")
-ridge_scaler = load_file("Models/ridge_scaler.pkl")
-svr_scaler_X = load_file("Models/svr_scaler_X.pkl")
-svr_scaler_y = load_file("Models/svr_scaler_y.pkl")
-tfidf_vectorizer = load_file("Models/tfidf_vectorizer.pkl")
-url_vectorizer = load_file("Models/url_vectorizer.pkl")
-vectorizer_joblib = load_file("Models/vectorizer.joblib")
-vectorizer_pkl = load_file("Models/vectorizer.pkl")
-# huggung face code end
 MODEL_DIR = "Models"
 DATA_DIR = "housedata"  # Assuming your house data is here

@@ -155,7 +167,7 @@ def ask_gemini(statement):
 return response.text
 #rfc
-model = load("Models/liar_model.joblib")
 vectorizer = load("Models/liar_vectorizer.joblib")
 # Load BERT fact-checker pipeline (local model)

@@ -237,16 +249,27 @@ def get_house_data():
 loaded_models = {}
 # Load logistic model and vectorizer for SMS
-vectorizer = joblib.load("Models/
 model = joblib.load("Models/logistic_model.pkl")
 # Load models once NB+DT+SVM is trained
-
-vectorizer = joblib.load("Models/
 #END NB+DT+SVM
 # === Naive Bayes URL Spam Classifier (NB_spam.html) ===
 # === Load Model & Vectorizer ===
 VT_API_KEY = os.getenv("VT_API_KEY")
 model_path = os.path.join("Models", "nb_url_model.pkl")

@@ -266,236 +289,412 @@ else:
 # Load dictionary words
-valid_words = set(words.words())
-def load_trusted_keywords(file_path):
-# Load trusted colleges from file
-with open("data/trusted_colleges.txt", "r") as f:
-whitelist = set([
-])
-def is_gibberish_word(word):
-def is_rule_based_spam(url):
-    if any(college in domain for college in trusted_colleges):
-        print("✅ Passed Rule 2.5: Trusted college name matched")
-        return False, "✅ Trusted college"
-    else:
-        print("✅ Passed Rule 2.5: No trusted college matched (continue checking)")
-        'free', 'bonus', 'offer', 'prize', 'winner', 'gift', 'coupon', 'discount',
-        'bank', 'paypal', 'creditcard', 'mastercard', 'visa', 'amex', 'westernunion',
-        'signin', 'click', 'password', 'unlock', 'recover', 'validate', 'urgency',
-        'limitedtime', 'expires', 'suspicious', 'alert', 'important', 'actionrequired'
-    ]
-    full_url_parts = url + path + query + fragment
-    if any(keyword in full_url_parts for keyword in phishing_keywords):
-        print("❌ Failed Rule 11: Contains phishing keyword")
-        return True, "🔍 Contains phishing keyword"
-    else:
-        print("✅ Passed Rule 11: No phishing keywords found")
-        return True, "📜 Very long URL path"
-    else:
-        print("✅ Passed Rule 12: Path length is acceptable")
-        return True, "📁 Suspicious file extension in path"
-    else:
-        print("✅ Passed Rule 13: No suspicious file extension")
-    # Gibberish Check
-    parts = re.split(r'[\/\.\-\_\?\=\&]', url)
-    long_parts = [p for p in parts if len(p) >= 5 and p.isalpha()]
-    gibberish_parts = [p for p in long_parts if is_gibberish_word(p)]
-    if len(long_parts) > 0 and (len(gibberish_parts) / len(long_parts)) > 0.6:
-        print("❌ Failed Rule 15: Mostly gibberish words")
-        return True, "🧾 Mostly gibberish / non-dictionary words"
-    else:
-        print("✅ Passed Rule 15: Words are mostly valid")
-(the remaining removed lines of this hunk were collapsed in the diff viewer)

@@ -800,43 +999,58 @@ def run_svr_demo():
 def clean_text(text):
     return text.lower().strip()
 @app.route('/logistic', methods=['GET', 'POST'])
 def logistic():
-    prediction = None
-    confidence_percentage = None
-    cleaned = None
-    tokens = None
-    vector = None
-    probability = None
 if request.method == "POST":
 try:
-    msg = request.form.get('message', '')
-    cleaned = clean_text(msg)
 vector = vectorizer.transform([cleaned])
 probability = model.predict_proba(vector)[0][1]
 prediction = "Spam" if probability >= 0.5 else "Not Spam"
 confidence_percentage = round(probability * 100, 2)
 except Exception as e:
-    print("Error
 prediction = "Error"
-    return render_template("logistic.html",
-                           prediction=prediction,
-                           confidence_percentage=confidence_percentage,
-                           cleaned=cleaned,
-                           tokens=cleaned.split() if cleaned else [],
-                           vector=vector.toarray().tolist() if vector is not None else [],
-                           probability=round(probability, 4) if probability else None,
-                           source="form")
-
 @app.route('/logistic-sms', methods=['POST'])
 def logistic_sms():
 try:
 data = request.get_json()
 msg = data.get('message', '')
 cleaned = clean_text(msg)
 vector = vectorizer.transform([cleaned])
 probability = model.predict_proba(vector)[0][1]
 prediction = "Spam" if probability >= 0.5 else "Not Spam"

@@ -847,8 +1061,7 @@ def logistic_sms():
 "confidence": confidence_percentage,
 "probability": round(probability, 4),
 "cleaned": cleaned,
-"tokens":
-"vector": vector.toarray().tolist(),
 "source": "json"
 })

@@ -1247,112 +1460,196 @@ def dt_visual_predict():
 # --- Naive Bayes Routes ---
 @app.route('/nb_spam')
 def nb_spam_page():
 return render_template('NB_spam.html')
-@app.route("/predict", methods=["POST"])
-def predict():
-    try:
-        import re
-        from urllib.parse import urlparse
-        from spellchecker import SpellChecker
-        data = request.get_json()
-        url = data.get("url")
-        if not url:
-            print("❌ No URL provided in request")
-            return jsonify({'error': 'No URL provided'}), 400
-            print(f"☣️ VirusTotal flagged it as malicious: {vt_reason}")
-            return jsonify({'prediction': 1, 'reason': vt_reason})
-            print(f"📛 Rule-based detection triggered: {rule_reason}")
-            return jsonify({'prediction': 1, 'reason': rule_reason})
-#
 spell = SpellChecker(distance=1)
-#
-        final_words = []
-        final_log = []
-        for word in parts:
-            if len(word) > 3 and word.isalpha():
-                split_words = wordninja.split(word.lower())
-                if len(split_words) <= 1:
-                    split_words = [word.lower()]
-                for w in split_words:
-                    if len(w) > 2 and w not in seen:
-                        seen.add(w)
-                        final_words.append(w)
-                        final_log.append({
-                            "word": w,
-                            "valid": w in dictionary_words
-                        })
-        return final_words, final_log
-        # Run extraction and get spelling log
-        words, spell_log = extract_words(url, dictionary_words)
-        misspelled = [entry["word"] for entry in spell_log if not entry["valid"]]
-        # If ML says safe but spell check has typos → override
-        if prediction == 0 and misspelled:
-            print("⚠️ Spelling Mismatch: CSV said Safe, but typos found:", misspelled)
 return jsonify({
 })
-        # ✅ Final Safe/Spam Decision
-        return jsonify({
-            'prediction': int(prediction),
-            'reason': "✅ Passed all checks" if prediction == 0 else "🧾 ML model flagged it",
-            'steps': spell_log
-        })
 except Exception as e:
-        return jsonify({'error': str(e)}), 500
-
-(the remaining removed lines of this hunk were collapsed in the diff viewer)

@@ -1550,4 +1847,4 @@ def DBSCAN():
 if __name__ == '__main__':
 #app.run(debug=True, port=5000)
-app.run(debug=True)
New side (added and context lines):

@@ -38,6 +38,18 @@ from dotenv import load_dotenv
 import os
 from urllib.parse import urlparse
 import tldextract
+import string
+
+
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+
+model_name = "microsoft/deberta-v3-small"
+
+tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+
+bert_checker = pipeline("text-classification", model=model, tokenizer=tokenizer)
+
 # Load environment variables from .env
 load_dotenv()
 #spam url import relateted

@@ -83,59 +95,59 @@ import google.generativeai as genai
 #huggung face code start
+# from huggingface_hub import hf_hub_download
+# import joblib
+# import numpy as np
+# import torch
+
+# REPO_ID = "deedrop1140/my-ml-models"
+
+# def load_file(filename):
+#     """Download a file from Hugging Face Hub and load it with the right library."""
+#     file_path = hf_hub_download(repo_id=REPO_ID, filename=filename)
+
+#     if filename.endswith(".pkl") or filename.endswith(".joblib"):
+#         return joblib.load(file_path)
+#     elif filename.endswith(".npy"):
+#         return np.load(file_path, allow_pickle=True)
+#     elif filename.endswith(".pt") or filename.endswith(".pth"):
+#         return torch.load(file_path)
+#     else:
+#         return file_path
+
+# # =====================
+# # Replace your old model loads with this:
+# # =====================
+
+# # Models
+# knn_model = load_file("Models/knn_model.pkl")
+# lasso_model = load_file("Models/lasso_model.pkl")
+# liar_model = load_file("Models/liar_model.joblib")
+# linear_model = load_file("Models/linear_model.pkl")
+# logistic_model = load_file("Models/logistic_model.pkl")
+# nb_url_model = load_file("Models/nb_url_model.pkl")
+# poly_model = load_file("Models/poly_model.pkl")
+# rf_model = load_file("Models/rf_model.pkl")
+# ridge_model = load_file("Models/ridge_model.pkl")
+# supervised_model = load_file("Models/supervised_model.pkl")
+# svr_model = load_file("Models/svr_model.pkl")
+# voting_url_model = load_file("Models/voting_url_model.pkl")
+
+# # Vectorizers / Encoders / Scalers
+# label_classes = load_file("Models/label_classes.npy")
+# label_encoder = load_file("Models/label_encoder.pkl")
+# lasso_scaler = load_file("Models/lasso_scaler.pkl")
+# liar_vectorizer = load_file("Models/liar_vectorizer.joblib")
+# nb_url_vectorizer = load_file("Models/nb_url_vectorizer.pkl")
+# poly_transform = load_file("Models/poly_transform.pkl")
+# ridge_scaler = load_file("Models/ridge_scaler.pkl")
+# svr_scaler_X = load_file("Models/svr_scaler_X.pkl")
+# svr_scaler_y = load_file("Models/svr_scaler_y.pkl")
+# tfidf_vectorizer = load_file("Models/tfidf_vectorizer.pkl")
+# url_vectorizer = load_file("Models/url_vectorizer.pkl")
+# vectorizer_joblib = load_file("Models/vectorizer.joblib")
+# vectorizer_pkl = load_file("Models/vectorizer.pkl")
+# # huggung face code end
 MODEL_DIR = "Models"
 DATA_DIR = "housedata"  # Assuming your house data is here

@@ -155,7 +167,7 @@ def ask_gemini(statement):
 return response.text
 #rfc
+# model = load("Models/liar_model.joblib")
 vectorizer = load("Models/liar_vectorizer.joblib")
 # Load BERT fact-checker pipeline (local model)

@@ -237,16 +249,27 @@ def get_house_data():
 loaded_models = {}
 # Load logistic model and vectorizer for SMS
+vectorizer = joblib.load("Models/logvectorizer.pkl")
 model = joblib.load("Models/logistic_model.pkl")
+
+
 # Load models once NB+DT+SVM is trained
+try:
+    vectorizer = joblib.load("Models/logvectorizer.pkl")
+    model = joblib.load("Models/logistic_model.pkl")
+    print("✅ Model and vectorizer loaded into memory successfully!")
+except Exception as e:
+    vectorizer = None
+    model = None
+    print(f"❌ Error: Could not load model or vectorizer. Please check your file paths. Error: {e}")
 #END NB+DT+SVM
 # === Naive Bayes URL Spam Classifier (NB_spam.html) ===
 # === Load Model & Vectorizer ===
+
+
+
 VT_API_KEY = os.getenv("VT_API_KEY")
 model_path = os.path.join("Models", "nb_url_model.pkl")
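The commented-out block above is the Hugging Face Hub loading path that this commit disables in favour of reading the pickles directly from Models/. A hedged, self-contained version of that helper (same REPO_ID and dispatch logic as the commented code; the example call at the bottom just reuses a path from that block):

# Sketch of the disabled Hub loader described in the commented block above.
# It downloads a file from the repo and loads it with the matching library.
from huggingface_hub import hf_hub_download
import joblib
import numpy as np
import torch

REPO_ID = "deedrop1140/my-ml-models"  # repo named in the commented-out code

def load_file(filename):
    """Download a file from the Hub and load it based on its extension."""
    file_path = hf_hub_download(repo_id=REPO_ID, filename=filename)
    if filename.endswith((".pkl", ".joblib")):
        return joblib.load(file_path)
    elif filename.endswith(".npy"):
        return np.load(file_path, allow_pickle=True)
    elif filename.endswith((".pt", ".pth")):
        return torch.load(file_path)
    return file_path

# Example, mirroring the commented block:
# logistic_model = load_file("Models/logistic_model.pkl")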
 # Load dictionary words
+# valid_words = set(words.words())

+# def load_trusted_keywords(file_path):
+#     with open(file_path, 'r', encoding='utf-8') as f:
+#         return set(line.strip().lower() for line in f if line.strip())

+# # # Load trusted colleges from file
+# # with open("data/trusted_colleges.txt", "r") as f:
+# #     trusted_colleges = set(line.strip().lower() for line in f if line.strip())

+# whitelist = set([
+#     # Search Engines
+#     'google', 'bing', 'yahoo', 'duckduckgo', 'baidu', 'ask',
+#     # Social Media
+#     'facebook', 'instagram', 'twitter', 'linkedin', 'snapchat', 'tiktok',
+#     'threads', 'pinterest', 'reddit', 'quora',
+#     # Communication Tools
+#     'whatsapp', 'telegram', 'skype', 'zoom', 'meet', 'discord',
+#     'teams', 'signal', 'messenger',
+#     # Global E-commerce
+#     'amazon', 'ebay', 'shopify', 'alibaba', 'walmart', 'target',
+#     'etsy', 'shein', 'bestbuy', 'costco', 'newegg',
+#     # Indian E-commerce / Services
+#     'flipkart', 'myntra', 'ajio', 'nykaa', 'meesho', 'snapdeal',
+#     'paytm', 'phonepe', 'mobikwik', 'zomato', 'swiggy', 'ola', 'uber', 'bookmyshow',
+#     'ixigo', 'makemytrip', 'yatra', 'redbus', 'bigbasket', 'grofers', 'blinkit','https://universalcollegeofengineering.edu.in',
+#     # Education / Productivity
+#     'youtube', 'docs', 'drive', 'calendar', 'photos', 'zoom',
+#     'gmail', 'notion', 'edx', 'coursera', 'udemy', 'khanacademy', 'byjus', 'unacademy',
+#     # News / Media / Tech
+#     'bbc', 'cnn', 'nyt', 'forbes', 'bloomberg', 'reuters',
+#     'ndtv', 'indiatimes', 'thehindu', 'hindustantimes', 'indiatoday',
+#     'techcrunch', 'verge', 'wired',
+#     # Streaming / Entertainment
+#     'netflix', 'hotstar', 'primevideo', 'spotify', 'gaana', 'wynk', 'saavn', 'voot',
+#     # Dev & Tools
+#     'github', 'stackoverflow', 'medium', 'gitlab', 'bitbucket',
+#     'adobe', 'figma', 'canva',
+#     # Financial / Banking
+#     'hdfcbank', 'icicibank', 'sbi', 'axisbank', 'kotak', 'boi', 'upi',
+#     'visa', 'mastercard', 'paypal', 'stripe', 'razorpay', 'phonepe', 'paytm',
+#     # Government / Utilities
+#     'gov', 'nic', 'irctc', 'uidai', 'mygov', 'incometax', 'aadhar', 'rbi',
+#     # Others Common
+#     'airtel', 'jio', 'bsnl', 'vi', 'speedtest', 'cricbuzz', 'espn', 'espncricinfo',
+#     'wikipedia', 'mozilla', 'opera', 'chrome', 'android', 'apple', 'windows', 'microsoft'
+# ])

+# def is_gibberish_word(word):
+#     word = word.lower()
+#     if len(word) < 4:
+#         return False
+#     if not word.isalpha():
+#         return True
+#     return word not in valid_words

+# def is_rule_based_spam(url):
+#     url = url.strip().lower()
+#     print(f"\n🌐 Checking URL: {url}")
+#     try:
+#         parsed = urlparse(url if url.startswith(("http://", "https://")) else "http://" + url)
+#         domain = parsed.netloc
+#         path = parsed.path
+#         query = parsed.query
+#         fragment = parsed.fragment
+#     except Exception as e:
+#         print("❌ Failed: Malformed URL")
+#         return True, f"❌ Malformed URL: {e}"
+#     if not domain:
+#         print("❌ Failed: Empty domain after parsing")
+#         return True, "❌ Empty domain after parsing"
+#     else:
+#         print("✅ Parsed domain:", domain)
+#     # --- Rules ---
+#     if '.' not in domain:
+#         print("❌ Failed Rule 1: Domain missing dot (.)")
+#         return True, "❌ Domain missing dot (.)"
+#     else:
+#         print("✅ Passed Rule 1: Domain contains dot")
+#     trusted_tlds = ['.gov', '.nic.in', '.edu', '.ac.in', '.mil', '.org', '.int', '.co.in', '.gov.in', '.res.in', '.net.in', '.nic.gov.in']
+#     if any(tld in domain for tld in trusted_tlds):
+#         print("✅ Passed Rule 2: Trusted TLD")
+#     else:
+#         print("✅ Passed Rule 2: Not a trusted TLD (but not blocked yet)")
+#     try:
+#         ext = tldextract.extract(url)
+#         domain_name = ext.domain
+#         suffix = ext.suffix
+#         print(f"✅ Extracted domain name: {domain_name}, suffix: {suffix}")
+#     except Exception:
+#         print("❌ Failed: Cannot extract domain/suffix")
+#         return True, "❌ Cannot extract domain/suffix"
+#     if domain_name in whitelist:
+#         print("✅ Skipping gibberish check for whitelisted domain")
+#     else:
+#         parts = re.split(r'[\/\.\-\_\?\=\&]', url)
+#         long_parts = [p for p in parts if len(p) >= 5 and p.isalpha()]
+#         gibberish_parts = [p for p in long_parts if is_gibberish_word(p)]
+#         if len(long_parts) > 0 and (len(gibberish_parts) / len(long_parts)) > 0.6:
+#             print("❌ Failed Rule 15: Mostly gibberish words")
+#             return True, "🧾 Mostly gibberish / non-dictionary words"
+#         else:
+#             print("✅ Passed Rule 15: Words are mostly valid")
+#     if re.match(r'^\d{1,3}(\.\d{1,3}){3}$', domain):
+#         print("❌ Failed Rule 3: IP address as domain")
+#         return True, "📟 IP address instead of domain"
+#     else:
+#         print("✅ Passed Rule 3: Domain is not an IP address")
+#     bad_tlds = ['.xyz', '.tk', '.ml', '.ga', '.cf', '.top', '.gq', '.cn', '.ru', '.pw', '.bid', '.link', '.loan', '.party', '.science', '.stream', '.webcam', '.online', '.site', '.website', '.space', '.club', '.buzz', '.info']
+#     if any(suffix == tld.lstrip('.') for tld in bad_tlds):
+#         print(f"❌ Failed Rule 4: Suspicious TLD ({suffix})")
+#         return True, "🧨 Suspicious top-level domain"
+#     else:
+#         print("✅ Passed Rule 4: TLD not in suspicious list")
+#     if len(domain_name) > 30:
+#         print("❌ Failed Rule 5: Very long domain name")
+#         return True, "📏 Very long and unrecognized domain name"
+#     else:
+#         print("✅ Passed Rule 5: Domain name length is acceptable")
+#     numeric_chars = sum(c.isdigit() for c in domain_name)
+#     if len(domain_name) > 5 and (numeric_chars / len(domain_name)) > 0.5:
+#         print("❌ Failed Rule 6: Numeric-heavy domain")
+#         return True, "🔢 Numeric-heavy domain name"
+#     else:
+#         print("✅ Passed Rule 6: Domain has few or no digits")
+#     if domain_name.count('-') > 3 or re.search(r'[!@#$%^&*()_+={}\[\]|\\:;"\'<>,?/`~]', domain_name):
+#         print("❌ Failed Rule 7: Too many special characters")
+#         return True, "➖ Excessive hyphens or special characters in domain"
+#     else:
+#         print("✅ Passed Rule 7: No excessive special characters")
+#     if domain_name.startswith('xn--'):
+#         print("❌ Failed Rule 8: Punycode detected")
+#         return True, "🌐 Punycode detected (potential homograph attack)"
+#     else:
+#         print("✅ Passed Rule 8: No punycode")
+#     subdomains = ext.subdomain.split('.') if ext.subdomain else []
+#     if len(subdomains) > 4:
+#         print("❌ Failed Rule 9: Excessive subdomains")
+#         return True, "🌳 Excessive subdomains"
+#     else:
+#         print("✅ Passed Rule 9: Subdomain count is normal")
+#     if re.match(r'^\d{1,3}(-\d{1,3}){3}$', domain_name.replace('.', '-')):
+#         print("❌ Failed Rule 10: Domain name formatted like an IP")
+#         return True, "🔢 Domain name formatted like an IP"
+#     else:
+#         print("✅ Passed Rule 10: Domain name is not IP-like")
+#     phishing_keywords = [
+#         'login', 'verify', 'secure', 'account', 'update', 'confirm', 'authenticate',
+#         'free', 'bonus', 'offer', 'prize', 'winner', 'gift', 'coupon', 'discount',
+#         'bank', 'paypal', 'creditcard', 'mastercard', 'visa', 'amex', 'westernunion',
+#         'signin', 'click', 'password', 'unlock', 'recover', 'validate', 'urgency',
+#         'limitedtime', 'expires', 'suspicious', 'alert', 'important', 'actionrequired'
+#     ]
+#     full_url_parts = url + path + query + fragment
+#     if any(keyword in full_url_parts for keyword in phishing_keywords):
+#         print("❌ Failed Rule 11: Contains phishing keyword")
+#         return True, "🔍 Contains phishing keyword"
+#     else:
+#         print("✅ Passed Rule 11: No phishing keywords found")
+#     if len(path) > 100:
+#         print("❌ Failed Rule 12: Very long path")
+#         return True, "📜 Very long URL path"
+#     else:
+#         print("✅ Passed Rule 12: Path length is acceptable")
+#     suspicious_extensions = ['.exe', '.zip', '.rar', '.js', '.php', '.asp', '.aspx', '.jsp', '.sh']
+#     if any(path.endswith(ext) for ext in suspicious_extensions):
+#         print("❌ Failed Rule 13: Suspicious file extension")
+#         return True, "📁 Suspicious file extension in path"
+#     else:
+#         print("✅ Passed Rule 13: No suspicious file extension")
+#     if any(param in query for param in ['redirect=', 'url=', 'goto=', 'link=']):
+#         print("❌ Failed Rule 14: Redirect pattern in query")
+#         return True, "🔗 Potential redirect link"
+#     else:
+#         print("✅ Passed Rule 14: No redirect pattern in query")
+#     # Gibberish Check
+#     parts = re.split(r'[\/\.\-\_\?\=\&]', url)
+#     long_parts = [p for p in parts if len(p) >= 5 and p.isalpha()]
+#     gibberish_parts = [p for p in long_parts if is_gibberish_word(p)]
+#     if len(long_parts) > 0 and (len(gibberish_parts) / len(long_parts)) > 0.6:
+#         print("❌ Failed Rule 15: Mostly gibberish words")
+#         return True, "🧾 Mostly gibberish / non-dictionary words"
+#     else:
+#         print("✅ Passed Rule 15: Words are mostly valid")
+#     print("✅ All rule-based checks passed")
+#     return False, None

+#end of navis baiyes
+#start of navi# --- Dictionary Words ---
+# valid_words = set(words.words())
+
+# # --- Load Trusted Keywords ---
+# def load_trusted_keywords(file_path):
+#     with open(file_path, 'r', encoding='utf-8') as f:
+#         return set(line.strip().lower() for line in f if line.strip())
+
+# # --- Whitelist (common safe domains/services) ---
+# whitelist = set([
+#     # Search Engines
+#     'google', 'bing', 'yahoo', 'duckduckgo', 'baidu', 'ask',
+#     # Social Media
+#     'facebook', 'instagram', 'twitter', 'linkedin', 'snapchat', 'tiktok',
+#     'threads', 'pinterest', 'reddit', 'quora',
+#     # Communication Tools
+#     'whatsapp', 'telegram', 'skype', 'zoom', 'meet', 'discord',
+#     'teams', 'signal', 'messenger',
+#     # Global E-commerce
+#     'amazon', 'ebay', 'shopify', 'alibaba', 'walmart', 'target',
+#     'etsy', 'shein', 'bestbuy', 'costco', 'newegg',
+#     # Indian E-commerce / Services
+#     'flipkart', 'myntra', 'ajio', 'nykaa', 'meesho', 'snapdeal',
+#     'paytm', 'phonepe', 'mobikwik', 'zomato', 'swiggy', 'ola', 'uber', 'bookmyshow',
+#     'ixigo', 'makemytrip', 'yatra', 'redbus', 'bigbasket', 'grofers', 'blinkit',
+#     'universalcollegeofengineering',
+#     # Education / Productivity
+#     'youtube', 'docs', 'drive', 'calendar', 'photos', 'gmail', 'notion',
+#     'edx', 'coursera', 'udemy', 'khanacademy', 'byjus', 'unacademy',
+#     # News / Media / Tech
+#     'bbc', 'cnn', 'nyt', 'forbes', 'bloomberg', 'reuters',
+#     'ndtv', 'indiatimes', 'thehindu', 'hindustantimes', 'indiatoday',
+#     'techcrunch', 'verge', 'wired',
+#     # Streaming / Entertainment
+#     'netflix', 'hotstar', 'primevideo', 'spotify', 'gaana', 'wynk', 'saavn', 'voot',
+#     # Dev & Tools
+#     'github', 'stackoverflow', 'medium', 'gitlab', 'bitbucket',
+#     'adobe', 'figma', 'canva',
+#     # Financial / Banking
+#     'hdfcbank', 'icicibank', 'sbi', 'axisbank', 'kotak', 'boi', 'upi',
+#     'visa', 'mastercard', 'paypal', 'stripe', 'razorpay', 'phonepe', 'paytm',
+#     # Government / Utilities
+#     'gov', 'nic', 'irctc', 'uidai', 'mygov', 'incometax', 'aadhar', 'rbi',
+#     # Others Common
+#     'airtel', 'jio', 'bsnl', 'vi', 'speedtest', 'cricbuzz', 'espn', 'espncricinfo',
+#     'wikipedia', 'mozilla', 'opera', 'chrome', 'android', 'apple', 'windows', 'microsoft'
+# ])
+
+# # --- Gibberish Check Helper ---
+# def is_gibberish_word(word):
+#     word = word.lower()
+#     if len(word) < 4:
+#         return False
+#     if not word.isalpha():
+#         return True
+#     return word not in valid_words
+
+# # --- RULE BASED CHECK ---
+# def is_rule_based_spam(url, skip_gibberish=False):
+#     url = url.strip().lower()
+#     print(f"\n🌐 Checking URL: {url}")
+#     try:
+#         parsed = urlparse(url if url.startswith(("http://", "https://")) else "http://" + url)
+#         domain = parsed.netloc
+#         path = parsed.path
+#         query = parsed.query
+#         fragment = parsed.fragment
+#     except Exception as e:
+#         return True, f"❌ Malformed URL: {e}"
+#     if not domain:
+#         return True, "❌ Empty domain after parsing"
+#     # Rule 1: Dot in domain
+#     if '.' not in domain:
+#         return True, "❌ Domain missing dot (.)"
+#     # Trusted TLDs
+#     trusted_tlds = [
+#         '.gov', '.nic.in', '.edu', '.ac.in', '.mil', '.org', '.int',
+#         '.co.in', '.gov.in', '.res.in', '.net.in', '.nic.gov.in'
+#     ]
+#     try:
+#         ext = tldextract.extract(url)
+#         domain_name = ext.domain
+#         suffix = ext.suffix
+#         subdomains = ext.subdomain.split('.') if ext.subdomain else []
+#     except Exception:
+#         return True, "❌ Cannot extract domain/suffix"
+#     # --- WHITELIST / TRUSTED SKIP ---
+#     if any(tld in domain for tld in trusted_tlds) or domain_name in whitelist:
+#         print("✅ Trusted/whitelisted → gibberish will be skipped")
+#         skip_gibberish = True
+#     # Rule 3: IP as domain
+#     if re.match(r'^\d{1,3}(\.\d{1,3}){3}$', domain):
+#         return True, "📟 IP address instead of domain"
+#     # Rule 4: Bad TLD
+#     bad_tlds = ['.xyz', '.tk', '.ml', '.ga', '.cf', '.top', '.gq', '.cn',
+#                 '.ru', '.pw', '.bid', '.link', '.loan', '.party', '.science',
+#                 '.stream', '.webcam', '.online', '.site', '.website', '.space',
+#                 '.club', '.buzz', '.info']
+#     if any(suffix == tld.lstrip('.') for tld in bad_tlds):
+#         return True, "🧨 Suspicious top-level domain"
+#     # Rule 5: Long domain
+#     if len(domain_name) > 30:
+#         return True, "📏 Very long and unrecognized domain name"
+#     # Rule 6: Numeric-heavy
+#     numeric_chars = sum(c.isdigit() for c in domain_name)
+#     if len(domain_name) > 5 and (numeric_chars / len(domain_name)) > 0.5:
+#         return True, "🔢 Numeric-heavy domain name"
+#     # Rule 7: Special characters
+#     if domain_name.count('-') > 3 or re.search(r'[!@#$%^&*()_+={}\[\]|\\:;"\'<>,?/`~]', domain_name):
+#         return True, "➖ Excessive hyphens or special characters in domain"
+#     # Rule 8: Punycode
+#     if domain_name.startswith('xn--'):
+#         return True, "🌐 Punycode detected (potential homograph attack)"
+#     # Rule 9: Excessive subdomains
+#     if len(subdomains) > 4:
+#         return True, "🌳 Excessive subdomains"
+#     # Rule 10: Domain looks like IP
+#     if re.match(r'^\d{1,3}(-\d{1,3}){3}$', domain_name.replace('.', '-')):
+#         return True, "🔢 Domain name formatted like an IP"
+#     # Rule 11: Phishing keywords
+#     phishing_keywords = [
+#         'login', 'verify', 'secure', 'account', 'update', 'confirm', 'authenticate',
+#         'free', 'bonus', 'offer', 'prize', 'winner', 'gift', 'coupon', 'discount',
+#         'bank', 'paypal', 'creditcard', 'mastercard', 'visa', 'amex', 'westernunion',
+#         'signin', 'click', 'password', 'unlock', 'recover', 'validate', 'urgency',
+#         'limitedtime', 'expires', 'suspicious', 'alert', 'important', 'actionrequired'
+#     ]
+#     full_url_parts = url + path + query + fragment
+#     if any(keyword in full_url_parts for keyword in phishing_keywords):
+#         return True, "🔍 Contains phishing keyword"
+#     # Rule 12: Long path
+#     if len(path) > 100:
+#         return True, "📜 Very long URL path"
+#     # Rule 13: Suspicious file extensions
+#     suspicious_extensions = ['.exe', '.zip', '.rar', '.js', '.php', '.asp', '.aspx', '.jsp', '.sh']
+#     if any(path.endswith(ext) for ext in suspicious_extensions):
+#         return True, "📁 Suspicious file extension in path"
+#     # Rule 14: Redirect in query
+#     if any(param in query for param in ['redirect=', 'url=', 'goto=', 'link=']):
+#         return True, "🔗 Potential redirect link"
+#     # Rule 15: Gibberish (only if not skipped)
+#     if not skip_gibberish:
+#         parts = re.split(r'[\/\.\-\_\?\=\&]', url)
+#         long_parts = [p for p in parts if len(p) >= 5 and p.isalpha()]
+#         gibberish_parts = [p for p in long_parts if is_gibberish_word(p)]
+#         if len(long_parts) > 0 and (len(gibberish_parts) / len(long_parts)) > 0.6:
+#             return True, "🧾 Mostly gibberish / non-dictionary words"
+#     return False, None
+# #end of navbaiesd

 def clean_text(text):
 return text.lower().strip()
+
+import re
+
+# Load saved model and vectorizer
+model = joblib.load("Models/logistic_model.pkl")
+vectorizer = joblib.load("Models/logvectorizer.pkl")
+
+# Text cleaning
+def clean_text(text):
+    text = text.lower()
+    text = re.sub(r'\W', ' ', text)
+    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
+    text = re.sub(r'\s+', ' ', text)
+    return text.strip()

 @app.route('/logistic', methods=['GET', 'POST'])
 def logistic():
+    prediction, confidence_percentage, cleaned, tokens, probability = None, None, None, None, None

 if request.method == "POST":
+        msg = request.form.get('message', '')
+        cleaned = clean_text(msg)
+        tokens = cleaned.split()
+
 try:
 vector = vectorizer.transform([cleaned])
 probability = model.predict_proba(vector)[0][1]
 prediction = "Spam" if probability >= 0.5 else "Not Spam"
 confidence_percentage = round(probability * 100, 2)
 except Exception as e:
+            print("Error predicting:", e)
 prediction = "Error"
+            confidence_percentage = 0
+
+    return render_template(
+        "logistic.html",
+        prediction=prediction,
+        confidence_percentage=confidence_percentage,
+        cleaned=cleaned,
+        tokens=tokens,
+        probability=round(probability, 4) if probability else None,
+        source="sms"
+    )

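The replacement clean_text above lower-cases the message, turns non-word characters into spaces, drops stray single letters, and collapses whitespace before vectorizing. A quick illustration of its effect (the sample message is made up):

# Sketch: what the regex-based clean_text above does to a sample SMS.
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)              # non-word chars become spaces
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)  # drop isolated single letters
    text = re.sub(r'\s+', ' ', text)             # collapse runs of whitespace
    return text.strip()

print(clean_text("WIN a FREE prize!!! Reply Y to claim :-)"))
# -> "win free prize reply to claim"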
 @app.route('/logistic-sms', methods=['POST'])
 def logistic_sms():
 try:
 data = request.get_json()
 msg = data.get('message', '')
 cleaned = clean_text(msg)
+        tokens = cleaned.split()
+
 vector = vectorizer.transform([cleaned])
 probability = model.predict_proba(vector)[0][1]
 prediction = "Spam" if probability >= 0.5 else "Not Spam"

 "confidence": confidence_percentage,
 "probability": round(probability, 4),
 "cleaned": cleaned,
+        "tokens": tokens,
 "source": "json"
 })

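The /logistic-sms route accepts a JSON body with a message field and returns, at minimum, the confidence, probability, cleaned text, and tokens shown in the diff. A hedged local test (host and port are assumptions, not stated in the commit):

# Sketch: exercise the /logistic-sms JSON endpoint defined above.
# The local URL is an assumption for testing, not part of the commit.
import requests

resp = requests.post(
    "http://127.0.0.1:5000/logistic-sms",
    json={"message": "Congratulations! You have won a free prize, click here"},
    timeout=30,
)
result = resp.json()
print(result["confidence"], result["cleaned"])  # e.g. 97.42 "congratulations you have won free prize click here"
print(result["tokens"])                         # token list after clean_text()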
 # --- Naive Bayes Routes ---
+from urllib.parse import urlparse
+from sklearn.naive_bayes import GaussianNB
+from nltk.corpus import words

+model_path = "Models/nb_url_model.pkl"
+vectorizer_path = "Models/nb_url_vectorizer.pkl"

+if os.path.exists(model_path) and os.path.exists(vectorizer_path):
+    nb_model = joblib.load(model_path)
+    vectorizer = joblib.load(vectorizer_path)
+    print("✅ Loaded Naive Bayes URL model")
+else:
+    nb_model, vectorizer = None, None
+    print("❌ Model/vectorizer not found")

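With nb_model and its vectorizer loaded above, classifying a raw URL is a transform-then-predict call. A small hedged sketch, relying on the objects loaded just above and assuming they expose the usual scikit-learn transform/predict API (the helper name and example URL are illustrative, not part of the commit):

# Sketch: score one URL with the Naive Bayes artifacts loaded above,
# guarded the same way the loading code guards against missing files.
def nb_score_url(url: str):
    if nb_model is None or vectorizer is None:
        return None
    features = vectorizer.transform([url])            # vectorize the raw URL string
    return int(nb_model.predict(features)[0])         # 1 = spam, 0 = safe

# Example:
# nb_score_url("http://free-prize-login.xyz/claim")   # likely 1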
 @app.route('/nb_spam')
 def nb_spam_page():
 return render_template('NB_spam.html')

+import re
+from urllib.parse import urlparse
+from spellchecker import SpellChecker
+import wordninja

+# ---- Whitelist (your full one, unchanged) ----
whitelist = set([
|
| 1493 |
+
# Search Engines
|
| 1494 |
+
'google', 'bing', 'yahoo', 'duckduckgo', 'baidu', 'ask',
|
| 1495 |
|
| 1496 |
+
# Social Media
|
| 1497 |
+
'facebook', 'instagram', 'twitter', 'linkedin', 'snapchat', 'tiktok',
|
| 1498 |
+
'threads', 'pinterest', 'reddit', 'quora',
|
|
|
|
|
|
|
| 1499 |
|
| 1500 |
+
# Communication Tools
|
| 1501 |
+
'whatsapp', 'telegram', 'skype', 'zoom', 'meet', 'discord',
|
| 1502 |
+
'teams', 'signal', 'messenger',
|
| 1503 |
|
| 1504 |
+
# Global E-commerce
|
| 1505 |
+
'amazon', 'ebay', 'shopify', 'alibaba', 'walmart', 'target',
|
| 1506 |
+
'etsy', 'shein', 'bestbuy', 'costco', 'newegg',
|
|
|
|
|
|
|
| 1507 |
|
| 1508 |
+
# Indian E-commerce / Services
|
| 1509 |
+
'flipkart', 'myntra', 'ajio', 'nykaa', 'meesho', 'snapdeal',
|
| 1510 |
+
'paytm', 'phonepe', 'mobikwik', 'zomato', 'swiggy', 'ola', 'uber', 'bookmyshow',
|
| 1511 |
+
'ixigo', 'makemytrip', 'yatra', 'redbus', 'bigbasket', 'grofers', 'blinkit',
|
| 1512 |
+
'universalcollegeofengineering',
|
| 1513 |
|
| 1514 |
+
# Education / Productivity
|
| 1515 |
+
'youtube', 'docs', 'drive', 'calendar', 'photos', 'gmail', 'notion',
|
| 1516 |
+
'edx', 'coursera', 'udemy', 'khanacademy', 'byjus', 'unacademy',
|
| 1517 |
|
| 1518 |
+
# News / Media / Tech
|
| 1519 |
+
'bbc', 'cnn', 'nyt', 'forbes', 'bloomberg', 'reuters',
|
| 1520 |
+
'ndtv', 'indiatimes', 'thehindu', 'hindustantimes', 'indiatoday',
|
| 1521 |
+
'techcrunch', 'verge', 'wired',
|
| 1522 |
+
|
| 1523 |
+
# Streaming / Entertainment
|
| 1524 |
+
'netflix', 'hotstar', 'primevideo', 'spotify', 'gaana', 'wynk', 'saavn', 'voot',
|
| 1525 |
+
|
| 1526 |
+
# Dev & Tools
|
| 1527 |
+
'github', 'stackoverflow', 'medium', 'gitlab', 'bitbucket',
|
| 1528 |
+
'adobe', 'figma', 'canva',
|
| 1529 |
+
|
| 1530 |
+
# Financial / Banking
|
| 1531 |
+
'hdfcbank', 'icicibank', 'sbi', 'axisbank', 'kotak', 'boi', 'upi',
|
| 1532 |
+
'visa', 'mastercard', 'paypal', 'stripe', 'razorpay', 'phonepe', 'paytm',
|
| 1533 |
+
|
| 1534 |
+
# Government / Utilities
|
| 1535 |
+
'gov', 'nic', 'irctc', 'uidai', 'mygov', 'incometax', 'aadhar', 'rbi',
|
| 1536 |
+
|
| 1537 |
+
# Others Common
|
| 1538 |
+
'airtel', 'jio', 'bsnl', 'vi', 'speedtest', 'cricbuzz', 'espn', 'espncricinfo',
|
| 1539 |
+
'wikipedia', 'mozilla', 'opera', 'chrome', 'android', 'apple', 'windows', 'microsoft'
|
| 1540 |
+
])
|
+
+# ... your full whitelist from before ...
+
+# ---- Trusted & Bad TLDs ----
+trusted_tlds = [
+    '.gov', '.nic.in', '.edu', '.ac.in', '.mil', '.org', '.int',
+    '.co.in', '.gov.in', '.res.in', '.net.in', '.nic.gov.in'
+]
+
+# Expanded Bad TLDs (Rule 4)
+bad_tlds = [
+    '.xyz', '.tk', '.ml', '.ga', '.cf', '.top', '.gq', '.cn',
+    '.ru', '.pw', '.bid', '.link', '.loan', '.party', '.science',
+    '.stream', '.webcam', '.online', '.site', '.website', '.space',
+    '.club', '.buzz', '.info'
+]
+
+# Suspicious extensions (Rule 13)
+suspicious_extensions = ['.exe', '.zip', '.rar', '.js', '.php', '.asp', '.aspx', '.jsp', '.sh']
+
+# Phishing keywords (Rule 11, your full list)
+phishing_keywords = [
+    'login', 'verify', 'secure', 'account', 'update', 'confirm', 'authenticate',
+    'free', 'bonus', 'offer', 'prize', 'winner', 'gift', 'coupon', 'discount',
+    'bank', 'paypal', 'creditcard', 'mastercard', 'visa', 'amex', 'westernunion',
+    'signin', 'click', 'password', 'unlock', 'recover', 'validate', 'urgency',
+    'limitedtime', 'expires', 'suspicious', 'alert', 'important', 'actionrequired'
+]
+
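A minimal sketch of how these lists could feed simple checks on a parsed URL; the helper name and the exact checks below are assumptions for illustration, not code from this commit.

# Illustrative helper combining the lists above
from urllib.parse import urlparse

def quick_list_checks(url):
    parsed = urlparse(url if url.startswith(("http://", "https://")) else "http://" + url)
    host = parsed.netloc.lower()
    return {
        "whitelisted": any(part in whitelist for part in host.split('.')),
        "trusted_tld": any(host.endswith(tld) for tld in trusted_tlds),
        "bad_tld": any(host.endswith(tld) for tld in bad_tlds),
        "suspicious_ext": any(parsed.path.lower().endswith(ext) for ext in suspicious_extensions),
        "keyword_hits": [k for k in phishing_keywords if k in url.lower()],
    }

print(quick_list_checks("http://free-login-bonus.xyz/update.php"))
# expected flags: bad_tld and suspicious_ext True, plus several keyword hits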
+# ---- Rules 5–14 ----
+rules = {
+    5: r"https?://\d{1,3}(\.\d{1,3}){3}",
+    6: r"@[A-Za-z0-9.-]+\.[A-Za-z]{2,}",
+    7: r"(free money|win now|click here)",
+    8: r"https?://[^\s]*\.(ru|cn|tk)",
+    9: r"https?://.{0,6}\..{2,6}/.{0,6}",
+    10: r"[0-9]{10,}",
+    12: r"https?://[^\s]*@[^\s]+",
+    13: r"https?://[^\s]*//[^\s]+",
+    14: r"https?://[^\s]*\?(?:[^=]+=[^&]*&){5,}",
+}
+
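One way the numbered regex rules might be applied to a candidate URL; the helper name and return shape are illustrative assumptions, not part of the commit.

# Sketch only: collect the IDs of every rule whose pattern matches
import re

def apply_rules(url, rules):
    return [rule_id for rule_id, pattern in rules.items() if re.search(pattern, url)]

# A raw-IP URL should trigger rule 5 at minimum
print(apply_rules("http://192.168.1.10/login", rules))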
+# ---- Gibberish Check Helper (Rule 15) ----
+def is_gibberish_word(word):
+    if not word:
+        return False  # guard against empty strings (avoids division by zero)
+    vowels = "aeiou"
+    v_count = sum(c in vowels for c in word)
+    return v_count / len(word) < 0.25
+
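For reference, the heuristic flags a token when fewer than a quarter of its characters are vowels:

print(is_gibberish_word("xzqwrt"))   # True:  0 of 6 characters are vowels
print(is_gibberish_word("google"))   # False: 3 of 6 characters are vowels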
+# # ---- Utility: Extract words from URL ----
+# def extract_words(url):
+#     parsed = urlparse(url if url.startswith(("http://", "https://")) else "http://" + url)
+#     raw = parsed.netloc.replace('-', '') + parsed.path.replace('-', '')
+#     # Split using wordninja
+#     words = wordninja.split(raw.lower())
+#     # Keep only alphabetic words of length >= 3
+#     words = [w for w in words if w.isalpha() and len(w) >= 3]
+#     return words

+# ---- Extract words from URL ----
+def extract_words(url):
+    parsed = urlparse(url if url.startswith(("http://", "https://")) else "http://" + url)
+    parts = re.split(r'\W+', parsed.netloc + parsed.path)
+    final_words = []
+    for word in parts:
+        if len(word) > 2 and word.isalpha():
+            split_words = wordninja.split(word.lower())
+            if len(split_words) <= 1:
+                split_words = [word.lower()]
+            final_words.extend(split_words)
+    return final_words

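A quick illustration of what extract_words returns for a typical URL; the exact split depends on wordninja's word-frequency model, so treat the output as indicative only.

print(extract_words("http://securelogin-paypal.com/verify"))
# e.g. ['secure', 'login', 'paypal', 'com', 'verify']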
+# --- Your original predict function, now inside the Flask app ---
+@app.route("/predict", methods=["POST"])
+def predict():
+    try:
+        data = request.get_json()
+        url = data.get("url", "").lower()
+        if not url:
+            return jsonify({'error': 'No URL provided'}), 400
+
+        parsed = urlparse(url if url.startswith(("http://", "https://")) else "http://" + url)
+        path = parsed.path

+        # ---- SpellChecker using built-in dictionary ----
         spell = SpellChecker(distance=1)

+        # ---- Extract words and check spelling ----
+        words = extract_words(url)
+        # ignore known TLDs (dots stripped so entries match the extracted words)
+        tlds_to_ignore = [tld.replace('.', '') for tld in trusted_tlds + bad_tlds]
+        words_for_spellcheck = [w for w in words if w not in tlds_to_ignore]
+
+        misspelled = spell.unknown(words_for_spellcheck)
+        steps = [{"word": w, "valid": (w not in misspelled) or (w in tlds_to_ignore)} for w in words]
+
+        if misspelled:
             return jsonify({
+                "prediction": 1,
+                "reason": f"🧾 Spelling errors: {', '.join(misspelled)}",
+                "steps": steps
+            })
+        else:
+            return jsonify({
+                "prediction": 0,
+                "reason": "✅ No spelling issues",
+                "steps": steps
             })

     except Exception as e:
+        return jsonify({'error': f"An issue occurred during spell checking: {str(e)}"}), 500
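Once the app is running, the endpoint can be exercised with a small client like the one below; the host and port assume Flask's defaults and are illustrative.

# Illustrative client call against the /predict route
import requests

resp = requests.post("http://127.0.0.1:5000/predict",
                     json={"url": "http://paypa1-login.xyz/verify"})
result = resp.json()
print(result["prediction"], result["reason"])  # 1 plus a spelling-error reason, or 0 if every word passes
for step in result["steps"]:
    print(step["word"], "valid" if step["valid"] else "flagged")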

 if __name__ == '__main__':
     #app.run(debug=True, port=5000)
+    app.run(debug=True, use_reloader=False)

load_file.py
CHANGED
@@ -8,15 +8,26 @@ load_dotenv()
 # Get token from environment
 HF_TOKEN = os.getenv("HF_TOKEN")

-# Login (only needed if you don
+# Login (only needed if you don't use huggingface-cli)
 login(token=HF_TOKEN)

-REPO_ID = "deedrop1140/
+REPO_ID = "deedrop1140/Neroml"  # Replace with your repository ID

 def load_file(filename):
-
-
-
-
+    """
+    Downloads a specified file from the Hugging Face Hub repository.
+
+    Args:
+        filename (str): The name of the file to download from the repository.
+
+    Returns:
+        str: The local path where the downloaded file is stored.
+    """
+
+    file_path = hf_hub_download(
+        repo_id=REPO_ID,
+        filename=filename,
+        token=HF_TOKEN  # token is loaded from environment
     )
     return file_path
+
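A small usage sketch for this helper; the filename below is only an example and assumes the corresponding file exists in the configured repository.

# Example usage; "logistic_model.pkl" is an illustrative filename
import joblib
from load_file import load_file

local_path = load_file("logistic_model.pkl")  # downloads from the Hub (cached after the first call)
model = joblib.load(local_path)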
requirements.txt
CHANGED

Binary files a/requirements.txt and b/requirements.txt differ

save_token.py
CHANGED

@@ -2,4 +2,4 @@ import os
 from huggingface_hub import HfApi

 token = os.getenv("HF_TOKEN") # loaded from .env or system environment
-api = HfApi(token=token)
+api = HfApi(token=token)
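As a quick sanity check (assuming the token has at least read access), HfApi.whoami() can confirm which account the token resolves to:

user_info = api.whoami()   # raises if HF_TOKEN is missing or invalid
print(user_info["name"])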
train_logistic_model.py
CHANGED
@@ -1,55 +1,47 @@
 import pandas as pd
 import re
-
+import os
+import joblib
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
-import
-import os
+from sklearn.model_selection import train_test_split

-# Load
+# Load data
 df = pd.read_csv("data/spam.csv", encoding='latin-1')
-
-# Only keep the columns you need (for spam.csv structure)
 df = df[['v1', 'v2']]
 df.columns = ['label', 'message']
-
-# Drop duplicates
 df.drop_duplicates(inplace=True)
-
-# Fill missing values in messages with empty string (text can't use mean)
 df['message'] = df['message'].fillna("")
-
-# Fill missing values in label with mode (most common class)
 df['label'] = df['label'].fillna(df['label'].mode()[0])

-# Clean
+# Clean text
 def clean_text(text):
-    text = text.lower()
-    text = re.sub(r'\W', ' ', text)
-    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
-    text = re.sub(r'\s+', ' ', text)
+    text = text.lower()
+    text = re.sub(r'\W', ' ', text)
+    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
+    text = re.sub(r'\s+', ' ', text)
     return text.strip()

 df['message'] = df['message'].apply(clean_text)
-
-# Label encoding: spam = 1, ham = 0
 df['label'] = df['label'].map({'ham': 0, 'spam': 1})

-#
-X_train, X_test, y_train, y_test = train_test_split(
+# Train-test split
+X_train, X_test, y_train, y_test = train_test_split(
+    df['message'], df['label'], test_size=0.2, random_state=42
+)

-#
-vectorizer = TfidfVectorizer()
+# Vectorize
+vectorizer = TfidfVectorizer(max_features=5000)
 X_train_vec = vectorizer.fit_transform(X_train)
 X_test_vec = vectorizer.transform(X_test)

-#
+# Train model
 model = LogisticRegression()
 model.fit(X_train_vec, y_train)

-# Save model
+# Save model
 os.makedirs("Models", exist_ok=True)
 joblib.dump(model, "Models/logistic_model.pkl")
-joblib.dump(vectorizer, "Models/
+joblib.dump(vectorizer, "Models/logvectorizer.pkl")

-print("✅ Logistic model trained
+print("✅ Logistic model trained & saved successfully!")
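The script saves the artifacts without reporting test performance; a brief follow-up sketch, assuming the variables above are still in scope, could print held-out metrics.

# Optional evaluation sketch using the held-out split created above
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=["ham", "spam"]))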