deedrop1140 committed on
Commit
cc18324
·
verified ·
1 Parent(s): 3dc6c0d

Upload 72 files

.gitattributes CHANGED
@@ -1,36 +1,36 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- Static/decision_tree.png filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Models/Unconfirmed[[:space:]]784952.crdownload filter=lfs diff=lfs merge=lfs -text
Models/Unconfirmed 784952.crdownload ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b76814785969081fb542eb90f1adca0b7e08af310da68ab91231c806c4e3d53d
3
+ size 69189991
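Each of the binaries added in this commit is stored as a Git LFS pointer like the one above: a `version` line, an `oid sha256:` line, and a `size` in bytes. A minimal sketch of reading such a pointer with Python, assuming `git lfs pull` has not been run yet so the file on disk is still the small pointer text:

import pathlib

def parse_lfs_pointer(path):
    # Parse the "key value" lines of a Git LFS pointer file into a dict.
    fields = {}
    for line in pathlib.Path(path).read_text(encoding="utf-8").splitlines():
        if line.strip():
            key, _, value = line.partition(" ")
            fields[key] = value
    return fields

ptr = parse_lfs_pointer("Models/linear_model (1).pkl")  # file added in this commit
print(ptr["oid"], ptr["size"])  # e.g. sha256:e686db91... 577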
Models/linear_model (1).pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e686db9126ad24dbdd3eaee6b9915cce209e0c703e3279c23787cdb3f1fa6e7a
3
+ size 577
Models/logistic_model.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:436ed4986a74683a04c42554ee5e827e963971f108dee7cf7974e5e05e83a6b7
3
- size 62127
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57c8921a04cc148eb213bc4e1d21bf7d4e027401ea0dbe272567d6d6dd12d920
3
+ size 40863
Models/logvectorizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e51b1d8b6c8975d5469c9c7540af43fab5ac2bdce0008d7109cfdab4fd481917
3
+ size 160142
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Neroml
3
+ emoji: 📉
4
+ colorFrom: indigo
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 5.43.1
8
+ app_file: app.py
9
+ pinned: false
10
+ short_description: A web page that teaches ML algorithms with visualisation
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Templates/NB_spam.html CHANGED
@@ -72,74 +72,74 @@
72
  </style>
73
  </head>
74
  <body>
75
- <h1>🔍 Naive Bayes URL Spam Checker</h1>
76
-
77
- <input type="text" id="urlInput" placeholder="Enter URL (e.g. http://example.com)">
78
- <br>
79
- <button onclick="checkURL()">Check</button>
80
-
81
- <div id="result"></div>
82
- <div id="spellSteps"></div>
83
- <div id="reason"></div>
84
-
85
- <script>
86
- async function checkURL() {
87
- const url = document.getElementById("urlInput").value.trim();
88
- const resultDiv = document.getElementById("result");
89
- const reasonDiv = document.getElementById("reason");
90
- const spellStepsDiv = document.getElementById("spellSteps");
91
-
92
- // Clear previous outputs
93
- resultDiv.innerHTML = "⏳ Checking...";
94
- reasonDiv.innerHTML = "";
95
- spellStepsDiv.innerHTML = "";
96
-
97
- try {
98
- const response = await fetch('/predict', {
99
- method: 'POST',
100
- headers: { 'Content-Type': 'application/json' },
101
- body: JSON.stringify({ url: url })
102
- });
103
-
104
- const data = await response.json();
105
-
106
- // Show result
107
- if (data.prediction === 1) {
108
- resultDiv.innerHTML = "🚫 <span class='spam'>SPAM / PHISHING</span>";
109
- } else {
110
- resultDiv.innerHTML = "✅ <span class='safe'>This URL is SAFE</span>";
111
- }
112
-
113
- // Show reason
114
- if (data.reason) {
115
- reasonDiv.innerText = `🔍 Reason: ${data.reason}`;
116
- }
117
-
118
- // Show spell check steps
119
- if (data.steps && data.steps.length > 0) {
120
- const title = document.createElement("h3");
121
- title.innerText = "🧠 Spell Checker Log:";
122
- spellStepsDiv.appendChild(title);
123
-
124
- data.steps.forEach((step) => {
125
- const line = document.createElement("div");
126
- line.innerHTML = step.valid
127
- ? ` ${step.word} → Valid`
128
- : `❌ ${step.word} Misspelled`;
129
- line.style.color = step.valid ? "green" : "red";
130
- spellStepsDiv.appendChild(line);
131
- });
132
- }
133
-
134
- } catch (err) {
135
- resultDiv.innerHTML = "⚠️ Error checking the URL.";
136
- reasonDiv.innerText = err.message;
137
- }
138
- }
139
- </script>
140
- <div class="mt-6 text-center">
141
- <a href="/naive_bayes" class="inline-block bg-gray-200 hover:bg-gray-300 text-gray-800 px-4 py-2 rounded shadow">
142
- ← Back to Naive_bayes classification
143
- </a>
144
  </body>
145
- </html>
 
72
  </style>
73
  </head>
74
  <body>
75
+ <title>Naive Bayes URL Spam Checker</title>
76
+ </head>
77
+ <body>
78
+ <h1>🔍 Naive Bayes URL Spam Checker</h1>
79
+
80
+ <input type="text" id="urlInput" placeholder="Enter URL (e.g. http://example.com)">
81
+ <br>
82
+ <button onclick="checkURL()">Check</button>
83
+
84
+ <div id="result"></div>
85
+ <div id="spellSteps"></div>
86
+ <div id="reason"></div>
87
+
88
+ <script>
89
+ async function checkURL() {
90
+ const url = document.getElementById("urlInput").value.trim();
91
+ const resultDiv = document.getElementById("result");
92
+ const reasonDiv = document.getElementById("reason");
93
+ const spellStepsDiv = document.getElementById("spellSteps");
94
+
95
+ resultDiv.innerHTML = "⏳ Checking...";
96
+ reasonDiv.innerHTML = "";
97
+ spellStepsDiv.innerHTML = "";
98
+
99
+ try {
100
+ const response = await fetch('/predict', {
101
+ method: 'POST',
102
+ headers: { 'Content-Type': 'application/json' },
103
+ body: JSON.stringify({ url: url })
104
+ });
105
+
106
+ const data = await response.json();
107
+
108
+ if (data.prediction === 1) {
109
+ resultDiv.innerHTML = "🚫 <span class='spam'>SPAM / PHISHING</span>";
110
+ } else {
111
+ resultDiv.innerHTML = "✅ <span class='safe'>This URL is SAFE</span>";
112
+ }
113
+
114
+ if (data.reason) {
115
+ reasonDiv.innerText = `🔍 Reason: ${data.reason}`;
116
+ }
117
+
118
+ if (data.steps && data.steps.length > 0) {
119
+ const title = document.createElement("h3");
120
+ title.innerText = "🧠 Spell Checker Log:";
121
+ spellStepsDiv.appendChild(title);
122
+
123
+ data.steps.forEach((step) => {
124
+ const line = document.createElement("div");
125
+ line.innerHTML = step.valid
126
+ ? `✅ ${step.word} → Valid`
127
+ : ` ${step.word} → Misspelled`;
128
+ line.style.color = step.valid ? "green" : "red";
129
+ spellStepsDiv.appendChild(line);
130
+ });
131
+ }
132
+ } catch (err) {
133
+ resultDiv.innerHTML = "⚠️ Error checking the URL.";
134
+ reasonDiv.innerText = err.message;
135
+ }
136
+ }
137
+ </script>
138
+
139
+ <div class="mt-6 text-center">
140
+ <a href="/naive_bayes" class="inline-block bg-gray-200 hover:bg-gray-300 text-gray-800 px-4 py-2 rounded shadow">
141
+ Back to Naive Bayes classification
142
+ </a>
143
+ </div>
144
  </body>
145
+ </html>
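The script in this template posts JSON to the Flask `/predict` route (defined further down in `app.py`) and renders the returned `prediction`, `reason`, and `steps`. A minimal sketch of hitting the same endpoint from Python instead of the browser, assuming the app is running locally on the default Flask port:

import requests

resp = requests.post(
    "http://127.0.0.1:5000/predict",     # hypothetical local host/port
    json={"url": "http://example.com"},
    timeout=30,
)
data = resp.json()
print(data.get("prediction"), data.get("reason"))
for step in data.get("steps", []):
    print(step["word"], "valid" if step["valid"] else "misspelled")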
Templates/logistic.html CHANGED
@@ -58,7 +58,7 @@
58
  <ul class="space-y-1 font-mono text-xs">
59
  <li><strong>Cleaned Text:</strong> {{ cleaned }}</li>
60
  <li><strong>Tokenized:</strong> {{ tokens }}</li>
61
- <li><strong>Vector:</strong> {{ vector }}</li>
62
  <li><strong>Sigmoid Output:</strong> {{ probability }}</li>
63
  <li><strong>Final Prediction:</strong> {{ prediction }}</li>
64
  </ul>
 
58
  <ul class="space-y-1 font-mono text-xs">
59
  <li><strong>Cleaned Text:</strong> {{ cleaned }}</li>
60
  <li><strong>Tokenized:</strong> {{ tokens }}</li>
61
+
62
  <li><strong>Sigmoid Output:</strong> {{ probability }}</li>
63
  <li><strong>Final Prediction:</strong> {{ prediction }}</li>
64
  </ul>
app.py CHANGED
@@ -38,6 +38,18 @@ from dotenv import load_dotenv
38
  import os
39
  from urllib.parse import urlparse
40
  import tldextract
 
 
 
 
 
 
 
 
 
 
 
 
41
  # Load environment variables from .env
42
  load_dotenv()
43
  #spam url import related
@@ -83,59 +95,59 @@ import google.generativeai as genai
83
 
84
 
85
  # hugging face code start
86
- from huggingface_hub import hf_hub_download
87
- import joblib
88
- import numpy as np
89
- import torch
90
-
91
- REPO_ID = "deedrop1140/my-ml-models"
92
-
93
- def load_file(filename):
94
- """Download a file from Hugging Face Hub and load it with the right library."""
95
- file_path = hf_hub_download(repo_id=REPO_ID, filename=filename)
96
-
97
- if filename.endswith(".pkl") or filename.endswith(".joblib"):
98
- return joblib.load(file_path)
99
- elif filename.endswith(".npy"):
100
- return np.load(file_path, allow_pickle=True)
101
- elif filename.endswith(".pt") or filename.endswith(".pth"):
102
- return torch.load(file_path)
103
- else:
104
- return file_path
105
-
106
- # =====================
107
- # Replace your old model loads with this:
108
- # =====================
109
-
110
- # Models
111
- knn_model = load_file("Models/knn_model.pkl")
112
- lasso_model = load_file("Models/lasso_model.pkl")
113
- liar_model = load_file("Models/liar_model.joblib")
114
- linear_model = load_file("Models/linear_model.pkl")
115
- logistic_model = load_file("Models/logistic_model.pkl")
116
- nb_url_model = load_file("Models/nb_url_model.pkl")
117
- poly_model = load_file("Models/poly_model.pkl")
118
- rf_model = load_file("Models/rf_model.pkl")
119
- ridge_model = load_file("Models/ridge_model.pkl")
120
- supervised_model = load_file("Models/supervised_model.pkl")
121
- svr_model = load_file("Models/svr_model.pkl")
122
- voting_url_model = load_file("Models/voting_url_model.pkl")
123
-
124
- # Vectorizers / Encoders / Scalers
125
- label_classes = load_file("Models/label_classes.npy")
126
- label_encoder = load_file("Models/label_encoder.pkl")
127
- lasso_scaler = load_file("Models/lasso_scaler.pkl")
128
- liar_vectorizer = load_file("Models/liar_vectorizer.joblib")
129
- nb_url_vectorizer = load_file("Models/nb_url_vectorizer.pkl")
130
- poly_transform = load_file("Models/poly_transform.pkl")
131
- ridge_scaler = load_file("Models/ridge_scaler.pkl")
132
- svr_scaler_X = load_file("Models/svr_scaler_X.pkl")
133
- svr_scaler_y = load_file("Models/svr_scaler_y.pkl")
134
- tfidf_vectorizer = load_file("Models/tfidf_vectorizer.pkl")
135
- url_vectorizer = load_file("Models/url_vectorizer.pkl")
136
- vectorizer_joblib = load_file("Models/vectorizer.joblib")
137
- vectorizer_pkl = load_file("Models/vectorizer.pkl")
138
- # huggung face code end
139
 
140
  MODEL_DIR = "Models"
141
  DATA_DIR = "housedata" # Assuming your house data is here
@@ -155,7 +167,7 @@ def ask_gemini(statement):
155
  return response.text
156
 
157
  #rfc
158
- model = load("Models/liar_model.joblib")
159
  vectorizer = load("Models/liar_vectorizer.joblib")
160
 
161
  # Load BERT fact-checker pipeline (local model)
@@ -237,16 +249,27 @@ def get_house_data():
237
  loaded_models = {}
238
 
239
  # Load logistic model and vectorizer for SMS
240
- vectorizer = joblib.load("Models/vectorizer.pkl")
241
  model = joblib.load("Models/logistic_model.pkl")
242
 
 
 
243
  # Load models once NB+DT+SVM is trained
244
- model = joblib.load("Models/voting_url_model.pkl")
245
- vectorizer = joblib.load("Models/url_vectorizer.pkl")
 
 
 
 
 
 
246
  #END NB+DT+SVM
247
 
248
  # === Naive Bayes URL Spam Classifier (NB_spam.html) ===
249
  # === Load Model & Vectorizer ===
 
 
 
250
  VT_API_KEY = os.getenv("VT_API_KEY")
251
 
252
  model_path = os.path.join("Models", "nb_url_model.pkl")
@@ -266,236 +289,412 @@ else:
266
 
267
 
268
  # Load dictionary words
269
- valid_words = set(words.words())
270
 
271
- def load_trusted_keywords(file_path):
272
- with open(file_path, 'r', encoding='utf-8') as f:
273
- return set(line.strip().lower() for line in f if line.strip())
274
 
275
- # Load trusted colleges from file
276
- with open("data/trusted_colleges.txt", "r") as f:
277
- trusted_colleges = set(line.strip().lower() for line in f if line.strip())
278
 
279
 
280
 
281
- whitelist = set([
282
- # Search Engines
283
- 'google', 'bing', 'yahoo', 'duckduckgo', 'baidu', 'ask',
284
 
285
- # Social Media
286
- 'facebook', 'instagram', 'twitter', 'linkedin', 'snapchat', 'tiktok',
287
- 'threads', 'pinterest', 'reddit', 'quora',
288
 
289
- # Communication Tools
290
- 'whatsapp', 'telegram', 'skype', 'zoom', 'meet', 'discord',
291
- 'teams', 'signal', 'messenger',
292
 
293
- # Global E-commerce
294
- 'amazon', 'ebay', 'shopify', 'alibaba', 'walmart', 'target',
295
- 'etsy', 'shein', 'bestbuy', 'costco', 'newegg',
296
 
297
- # Indian E-commerce / Services
298
- 'flipkart', 'myntra', 'ajio', 'nykaa', 'meesho', 'snapdeal',
299
- 'paytm', 'phonepe', 'mobikwik', 'zomato', 'swiggy', 'ola', 'uber', 'bookmyshow',
300
- 'ixigo', 'makemytrip', 'yatra', 'redbus', 'bigbasket', 'grofers', 'blinkit','https://universalcollegeofengineering.edu.in',
301
 
302
- # Education / Productivity
303
- 'youtube', 'docs', 'drive', 'calendar', 'photos', 'zoom',
304
- 'gmail', 'notion', 'edx', 'coursera', 'udemy', 'khanacademy', 'byjus', 'unacademy',
305
 
306
- # News / Media / Tech
307
- 'bbc', 'cnn', 'nyt', 'forbes', 'bloomberg', 'reuters',
308
- 'ndtv', 'indiatimes', 'thehindu', 'hindustantimes', 'indiatoday',
309
- 'techcrunch', 'verge', 'wired',
310
 
311
- # Streaming / Entertainment
312
- 'netflix', 'hotstar', 'primevideo', 'spotify', 'gaana', 'wynk', 'saavn', 'voot',
313
 
314
- # Dev & Tools
315
- 'github', 'stackoverflow', 'medium', 'gitlab', 'bitbucket',
316
- 'adobe', 'figma', 'canva',
317
 
318
- # Financial / Banking
319
- 'hdfcbank', 'icicibank', 'sbi', 'axisbank', 'kotak', 'boi', 'upi',
320
- 'visa', 'mastercard', 'paypal', 'stripe', 'razorpay', 'phonepe', 'paytm',
321
 
322
- # Government / Utilities
323
- 'gov', 'nic', 'irctc', 'uidai', 'mygov', 'incometax', 'aadhar', 'rbi',
324
 
325
- # Others Common
326
- 'airtel', 'jio', 'bsnl', 'vi', 'speedtest', 'cricbuzz', 'espn', 'espncricinfo',
327
- 'wikipedia', 'mozilla', 'opera', 'chrome', 'android', 'apple', 'windows', 'microsoft'
328
- ])
329
 
330
 
331
- def is_gibberish_word(word):
332
- word = word.lower()
333
- if len(word) < 4:
334
- return False
335
- if not word.isalpha():
336
- return True
337
- return word not in valid_words
338
 
339
- def is_rule_based_spam(url):
340
- url = url.strip().lower()
341
- print(f"\n🌐 Checking URL: {url}")
342
 
343
- try:
344
- parsed = urlparse(url if url.startswith(("http://", "https://")) else "http://" + url)
345
- domain = parsed.netloc
346
- path = parsed.path
347
- query = parsed.query
348
- fragment = parsed.fragment
349
- except Exception as e:
350
- print("❌ Failed: Malformed URL")
351
- return True, f"❌ Malformed URL: {e}"
352
 
353
- if not domain:
354
- print("❌ Failed: Empty domain after parsing")
355
- return True, "❌ Empty domain after parsing"
356
- else:
357
- print("✅ Parsed domain:", domain)
358
 
359
- # --- Rules ---
360
 
361
- if '.' not in domain:
362
- print("❌ Failed Rule 1: Domain missing dot (.)")
363
- return True, "❌ Domain missing dot (.)"
364
- else:
365
- print("✅ Passed Rule 1: Domain contains dot")
366
 
367
- trusted_tlds = ['.gov', '.nic.in', '.edu', '.ac.in', '.mil', '.org', '.int', '.co.in', '.gov.in', '.res.in', '.net.in', '.nic.gov.in']
368
- if any(tld in domain for tld in trusted_tlds):
369
- print("✅ Passed Rule 2: Trusted TLD")
370
- else:
371
- print("✅ Passed Rule 2: Not a trusted TLD (but not blocked yet)")
372
 
373
- try:
374
- ext = tldextract.extract(url)
375
- domain_name = ext.domain
376
- suffix = ext.suffix
377
- print(f"✅ Extracted domain name: {domain_name}, suffix: {suffix}")
378
- except Exception:
379
- print("❌ Failed: Cannot extract domain/suffix")
380
- return True, "❌ Cannot extract domain/suffix"
381
- # Rule 2.5: Check if domain matches a trusted college keyword
382
- if any(college in domain for college in trusted_colleges):
383
- print("✅ Passed Rule 2.5: Trusted college name matched")
384
- return False, "✅ Trusted college"
385
- else:
386
- print("✅ Passed Rule 2.5: No trusted college matched (continue checking)")
387
 
388
- if domain_name in whitelist:
389
- print("✅ Skipping gibberish check for whitelisted domain")
390
- else:
391
- parts = re.split(r'[\/\.\-\_\?\=\&]', url)
392
- long_parts = [p for p in parts if len(p) >= 5 and p.isalpha()]
393
- gibberish_parts = [p for p in long_parts if is_gibberish_word(p)]
394
 
395
- if len(long_parts) > 0 and (len(gibberish_parts) / len(long_parts)) > 0.6:
396
- print("❌ Failed Rule 15: Mostly gibberish words")
397
- return True, "🧾 Mostly gibberish / non-dictionary words"
398
- else:
399
- print("✅ Passed Rule 15: Words are mostly valid")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
 
401
- if re.match(r'^\d{1,3}(\.\d{1,3}){3}$', domain):
402
- print("❌ Failed Rule 3: IP address as domain")
403
- return True, "📟 IP address instead of domain"
404
- else:
405
- print("✅ Passed Rule 3: Domain is not an IP address")
406
-
407
- bad_tlds = ['.xyz', '.tk', '.ml', '.ga', '.cf', '.top', '.gq', '.cn', '.ru', '.pw', '.bid', '.link', '.loan', '.party', '.science', '.stream', '.webcam', '.online', '.site', '.website', '.space', '.club', '.buzz', '.info']
408
- if any(suffix == tld.lstrip('.') for tld in bad_tlds):
409
- print(f"❌ Failed Rule 4: Suspicious TLD ({suffix})")
410
- return True, "🧨 Suspicious top-level domain"
411
- else:
412
- print("✅ Passed Rule 4: TLD not in suspicious list")
413
-
414
- if len(domain_name) > 30:
415
- print("❌ Failed Rule 5: Very long domain name")
416
- return True, "📏 Very long and unrecognized domain name"
417
- else:
418
- print("✅ Passed Rule 5: Domain name length is acceptable")
419
-
420
- numeric_chars = sum(c.isdigit() for c in domain_name)
421
- if len(domain_name) > 5 and (numeric_chars / len(domain_name)) > 0.5:
422
- print("❌ Failed Rule 6: Numeric-heavy domain")
423
- return True, "🔢 Numeric-heavy domain name"
424
- else:
425
- print("✅ Passed Rule 6: Domain has few or no digits")
426
-
427
- if domain_name.count('-') > 3 or re.search(r'[!@#$%^&*()_+={}\[\]|\\:;"\'<>,?/`~]', domain_name):
428
- print("❌ Failed Rule 7: Too many special characters")
429
- return True, "➖ Excessive hyphens or special characters in domain"
430
- else:
431
- print("✅ Passed Rule 7: No excessive special characters")
432
-
433
- if domain_name.startswith('xn--'):
434
- print("❌ Failed Rule 8: Punycode detected")
435
- return True, "🌐 Punycode detected (potential homograph attack)"
436
- else:
437
- print("✅ Passed Rule 8: No punycode")
438
-
439
- subdomains = ext.subdomain.split('.') if ext.subdomain else []
440
- if len(subdomains) > 4:
441
- print("❌ Failed Rule 9: Excessive subdomains")
442
- return True, "🌳 Excessive subdomains"
443
- else:
444
- print("✅ Passed Rule 9: Subdomain count is normal")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
 
446
- if re.match(r'^\d{1,3}(-\d{1,3}){3}$', domain_name.replace('.', '-')):
447
- print("❌ Failed Rule 10: Domain name formatted like an IP")
448
- return True, "🔢 Domain name formatted like an IP"
449
- else:
450
- print("✅ Passed Rule 10: Domain name is not IP-like")
451
-
452
- phishing_keywords = [
453
- 'login', 'verify', 'secure', 'account', 'update', 'confirm', 'authenticate',
454
- 'free', 'bonus', 'offer', 'prize', 'winner', 'gift', 'coupon', 'discount',
455
- 'bank', 'paypal', 'creditcard', 'mastercard', 'visa', 'amex', 'westernunion',
456
- 'signin', 'click', 'password', 'unlock', 'recover', 'validate', 'urgency',
457
- 'limitedtime', 'expires', 'suspicious', 'alert', 'important', 'actionrequired'
458
- ]
459
- full_url_parts = url + path + query + fragment
460
- if any(keyword in full_url_parts for keyword in phishing_keywords):
461
- print("❌ Failed Rule 11: Contains phishing keyword")
462
- return True, "🔍 Contains phishing keyword"
463
- else:
464
- print("✅ Passed Rule 11: No phishing keywords found")
465
 
466
- if len(path) > 100:
467
- print("❌ Failed Rule 12: Very long path")
468
- return True, "📜 Very long URL path"
469
- else:
470
- print("✅ Passed Rule 12: Path length is acceptable")
471
 
472
- suspicious_extensions = ['.exe', '.zip', '.rar', '.js', '.php', '.asp', '.aspx', '.jsp', '.sh']
473
- if any(path.endswith(ext) for ext in suspicious_extensions):
474
- print("❌ Failed Rule 13: Suspicious file extension")
475
- return True, "📁 Suspicious file extension in path"
476
- else:
477
- print("✅ Passed Rule 13: No suspicious file extension")
478
 
479
- if any(param in query for param in ['redirect=', 'url=', 'goto=', 'link=']):
480
- print("❌ Failed Rule 14: Redirect pattern in query")
481
- return True, "🔗 Potential redirect link"
482
- else:
483
- print("✅ Passed Rule 14: No redirect pattern in query")
484
-
485
- # Gibberish Check
486
- parts = re.split(r'[\/\.\-\_\?\=\&]', url)
487
- long_parts = [p for p in parts if len(p) >= 5 and p.isalpha()]
488
- gibberish_parts = [p for p in long_parts if is_gibberish_word(p)]
489
- if len(long_parts) > 0 and (len(gibberish_parts) / len(long_parts)) > 0.6:
490
- print("❌ Failed Rule 15: Mostly gibberish words")
491
- return True, "🧾 Mostly gibberish / non-dictionary words"
492
- else:
493
- print("✅ Passed Rule 15: Words are mostly valid")
494
 
495
- print("✅ All rule-based checks passed")
496
- return False, None
497
-
498
- #end of navis baiyes
 
499
 
500
 
501
 
@@ -800,43 +999,58 @@ def run_svr_demo():
800
 
801
  def clean_text(text):
802
  return text.lower().strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
803
 
804
  @app.route('/logistic', methods=['GET', 'POST'])
805
  def logistic():
806
- prediction = None
807
- confidence_percentage = None
808
- cleaned = None
809
- tokens = None
810
- vector = None
811
- probability = None
812
 
813
  if request.method == "POST":
 
 
 
 
814
  try:
815
- msg = request.form.get('message', '')
816
- cleaned = clean_text(msg)
817
  vector = vectorizer.transform([cleaned])
818
  probability = model.predict_proba(vector)[0][1]
819
  prediction = "Spam" if probability >= 0.5 else "Not Spam"
820
  confidence_percentage = round(probability * 100, 2)
821
  except Exception as e:
822
- print("Error in /logistic:", e)
823
  prediction = "Error"
 
 
 
 
 
 
 
 
 
 
 
824
 
825
- return render_template("logistic.html",
826
- prediction=prediction,
827
- confidence_percentage=confidence_percentage,
828
- cleaned=cleaned,
829
- tokens=cleaned.split() if cleaned else [],
830
- vector=vector.toarray().tolist() if vector is not None else [],
831
- probability=round(probability, 4) if probability else None,
832
- source="form")
833
-
834
  @app.route('/logistic-sms', methods=['POST'])
835
  def logistic_sms():
836
  try:
837
  data = request.get_json()
838
  msg = data.get('message', '')
839
  cleaned = clean_text(msg)
 
 
840
  vector = vectorizer.transform([cleaned])
841
  probability = model.predict_proba(vector)[0][1]
842
  prediction = "Spam" if probability >= 0.5 else "Not Spam"
@@ -847,8 +1061,7 @@ def logistic_sms():
847
  "confidence": confidence_percentage,
848
  "probability": round(probability, 4),
849
  "cleaned": cleaned,
850
- "tokens": cleaned.split(),
851
- "vector": vector.toarray().tolist(),
852
  "source": "json"
853
  })
854
 
@@ -1247,112 +1460,196 @@ def dt_visual_predict():
1247
 
1248
  # --- Naive Bayes Routes ---
1249
 
 
 
 
1250
 
 
 
1251
 
 
 
 
 
 
 
 
1252
 
1253
 
1254
  @app.route('/nb_spam')
1255
  def nb_spam_page():
1256
  return render_template('NB_spam.html')
1257
 
1258
- @app.route("/predict", methods=["POST"])
1259
- def predict():
1260
- try:
1261
- import re
1262
- from urllib.parse import urlparse
1263
- from spellchecker import SpellChecker
1264
 
1265
- import wordninja
 
 
 
1266
 
1267
- data = request.get_json()
1268
- url = data.get("url")
1269
 
1270
- if not url:
1271
- print("❌ No URL provided in request")
1272
- return jsonify({'error': 'No URL provided'}), 400
1273
 
1274
- print(f"\n🌐 Checking URL: {url}")
 
 
 
1275
 
1276
- # 1. ✅ VirusTotal
1277
- vt_flagged, vt_reason = check_with_virustotal(url)
1278
- if vt_flagged:
1279
- print(f"☣️ VirusTotal flagged it as malicious: {vt_reason}")
1280
- return jsonify({'prediction': 1, 'reason': vt_reason})
1281
 
1282
- print("✅ VirusTotal check passed")
 
 
1283
 
1284
- # 2. ✅ Rule-based
1285
- rule_flagged, rule_reason = is_rule_based_spam(url)
1286
- if rule_flagged:
1287
- print(f"📛 Rule-based detection triggered: {rule_reason}")
1288
- return jsonify({'prediction': 1, 'reason': rule_reason})
1289
 
1290
- print("✅ Rule-based checks passed")
 
 
 
 
1291
 
1292
- # 3. ML Prediction
1293
- features = vectorizer.transform([url])
1294
- prediction = nb_model.predict(features)[0]
1295
 
1296
- print(f"📊 ML Model predicted: {'SPAM' if prediction == 1 else 'SAFE'} (prediction = {prediction})")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1297
 
1298
- # 4️⃣ Spelling Checker 🔍
1299
  spell = SpellChecker(distance=1)
1300
 
1301
- # Load custom dictionary from words_alpha
1302
- with open("data/words_alpha.txt", "r") as f:
1303
- dictionary_words = set(line.strip().lower() for line in f if line.strip())
1304
- spell.word_frequency.load_words(dictionary_words)
1305
-
1306
- # --- Word Extraction and Spell Check ---
1307
- def extract_words(url, dictionary_words):
1308
- parsed = urlparse(url)
1309
- parts = re.split(r'\W+', parsed.netloc + parsed.path)
1310
- seen = set()
1311
- final_words = []
1312
- final_log = []
1313
-
1314
- for word in parts:
1315
- if len(word) > 3 and word.isalpha():
1316
- split_words = wordninja.split(word.lower())
1317
-
1318
- if len(split_words) <= 1:
1319
- split_words = [word.lower()]
1320
-
1321
- for w in split_words:
1322
- if len(w) > 2 and w not in seen:
1323
- seen.add(w)
1324
- final_words.append(w)
1325
- final_log.append({
1326
- "word": w,
1327
- "valid": w in dictionary_words
1328
- })
1329
-
1330
- return final_words, final_log
1331
-
1332
- # Run extraction and get spelling log
1333
- words, spell_log = extract_words(url, dictionary_words)
1334
- misspelled = [entry["word"] for entry in spell_log if not entry["valid"]]
1335
-
1336
- # If ML says safe but spell check has typos → override
1337
- if prediction == 0 and misspelled:
1338
- print("⚠️ Spelling Mismatch: CSV said Safe, but typos found:", misspelled)
1339
  return jsonify({
1340
- 'prediction': 1,
1341
- 'reason': f"⚠️ Spelling mismatch: {', '.join(misspelled)}",
1342
- 'steps': spell_log
 
 
 
 
 
 
1343
  })
1344
-
1345
- # ✅ Final Safe/Spam Decision
1346
- return jsonify({
1347
- 'prediction': int(prediction),
1348
- 'reason': "✅ Passed all checks" if prediction == 0 else "🧾 ML model flagged it",
1349
- 'steps': spell_log
1350
- })
1351
 
1352
  except Exception as e:
1353
- print(f" ERROR in /predict: {e}")
1354
- return jsonify({'error': str(e)}), 500
1355
-
1356
 
1357
 
1358
 
@@ -1550,4 +1847,4 @@ def DBSCAN():
1550
 
1551
  if __name__ == '__main__':
1552
  #app.run(debug=True, port=5000)
1553
- app.run(debug=True)
 
38
  import os
39
  from urllib.parse import urlparse
40
  import tldextract
41
+ import string
42
+
43
+
44
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
45
+
46
+ model_name = "microsoft/deberta-v3-small"
47
+
48
+ tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
49
+ model = AutoModelForSequenceClassification.from_pretrained(model_name)
50
+
51
+ bert_checker = pipeline("text-classification", model=model, tokenizer=tokenizer)
52
+
53
  # Load environment variables from .env
54
  load_dotenv()
55
  #spam url import related
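A caveat on the transformers block added in this hunk: `microsoft/deberta-v3-small` is a base checkpoint, so `AutoModelForSequenceClassification` attaches a freshly initialized classification head and the labels produced by `bert_checker` are not meaningful until the model is fine-tuned. A minimal sketch of how the resulting pipeline is invoked (the label name shown is illustrative, not from this repo):

# pipeline("text-classification", ...) returns one dict per input string
result = bert_checker("The Eiffel Tower is located in Paris.")
print(result[0]["label"], round(result[0]["score"], 3))  # e.g. LABEL_0 0.512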
 
95
 
96
 
97
  # hugging face code start
98
+ # from huggingface_hub import hf_hub_download
99
+ # import joblib
100
+ # import numpy as np
101
+ # import torch
102
+
103
+ # REPO_ID = "deedrop1140/my-ml-models"
104
+
105
+ # def load_file(filename):
106
+ # """Download a file from Hugging Face Hub and load it with the right library."""
107
+ # file_path = hf_hub_download(repo_id=REPO_ID, filename=filename)
108
+
109
+ # if filename.endswith(".pkl") or filename.endswith(".joblib"):
110
+ # return joblib.load(file_path)
111
+ # elif filename.endswith(".npy"):
112
+ # return np.load(file_path, allow_pickle=True)
113
+ # elif filename.endswith(".pt") or filename.endswith(".pth"):
114
+ # return torch.load(file_path)
115
+ # else:
116
+ # return file_path
117
+
118
+ # # =====================
119
+ # # Replace your old model loads with this:
120
+ # # =====================
121
+
122
+ # # Models
123
+ # knn_model = load_file("Models/knn_model.pkl")
124
+ # lasso_model = load_file("Models/lasso_model.pkl")
125
+ # liar_model = load_file("Models/liar_model.joblib")
126
+ # linear_model = load_file("Models/linear_model.pkl")
127
+ # logistic_model = load_file("Models/logistic_model.pkl")
128
+ # nb_url_model = load_file("Models/nb_url_model.pkl")
129
+ # poly_model = load_file("Models/poly_model.pkl")
130
+ # rf_model = load_file("Models/rf_model.pkl")
131
+ # ridge_model = load_file("Models/ridge_model.pkl")
132
+ # supervised_model = load_file("Models/supervised_model.pkl")
133
+ # svr_model = load_file("Models/svr_model.pkl")
134
+ # voting_url_model = load_file("Models/voting_url_model.pkl")
135
+
136
+ # # Vectorizers / Encoders / Scalers
137
+ # label_classes = load_file("Models/label_classes.npy")
138
+ # label_encoder = load_file("Models/label_encoder.pkl")
139
+ # lasso_scaler = load_file("Models/lasso_scaler.pkl")
140
+ # liar_vectorizer = load_file("Models/liar_vectorizer.joblib")
141
+ # nb_url_vectorizer = load_file("Models/nb_url_vectorizer.pkl")
142
+ # poly_transform = load_file("Models/poly_transform.pkl")
143
+ # ridge_scaler = load_file("Models/ridge_scaler.pkl")
144
+ # svr_scaler_X = load_file("Models/svr_scaler_X.pkl")
145
+ # svr_scaler_y = load_file("Models/svr_scaler_y.pkl")
146
+ # tfidf_vectorizer = load_file("Models/tfidf_vectorizer.pkl")
147
+ # url_vectorizer = load_file("Models/url_vectorizer.pkl")
148
+ # vectorizer_joblib = load_file("Models/vectorizer.joblib")
149
+ # vectorizer_pkl = load_file("Models/vectorizer.pkl")
150
+ # # hugging face code end
151
 
152
  MODEL_DIR = "Models"
153
  DATA_DIR = "housedata" # Assuming your house data is here
 
167
  return response.text
168
 
169
  #rfc
170
+ # model = load("Models/liar_model.joblib")
171
  vectorizer = load("Models/liar_vectorizer.joblib")
172
 
173
  # Load BERT fact-checker pipeline (local model)
 
249
  loaded_models = {}
250
 
251
  # Load logistic model and vectorizer for SMS
252
+ vectorizer = joblib.load("Models/logvectorizer.pkl")
253
  model = joblib.load("Models/logistic_model.pkl")
254
 
255
+
256
+
257
  # Load models once NB+DT+SVM is trained
258
+ try:
259
+ vectorizer = joblib.load("Models/logvectorizer.pkl")
260
+ model = joblib.load("Models/logistic_model.pkl")
261
+ print("✅ Model and vectorizer loaded into memory successfully!")
262
+ except Exception as e:
263
+ vectorizer = None
264
+ model = None
265
+ print(f"❌ Error: Could not load model or vectorizer. Please check your file paths. Error: {e}")
266
  #END NB+DT+SVM
267
 
268
  # === Naive Bayes URL Spam Classifier (NB_spam.html) ===
269
  # === Load Model & Vectorizer ===
270
+
271
+
272
+
273
  VT_API_KEY = os.getenv("VT_API_KEY")
274
 
275
  model_path = os.path.join("Models", "nb_url_model.pkl")
 
289
 
290
 
291
  # Load dictionary words
292
+ # valid_words = set(words.words())
293
 
294
+ # def load_trusted_keywords(file_path):
295
+ # with open(file_path, 'r', encoding='utf-8') as f:
296
+ # return set(line.strip().lower() for line in f if line.strip())
297
 
298
+ # # # Load trusted colleges from file
299
+ # # with open("data/trusted_colleges.txt", "r") as f:
300
+ # # trusted_colleges = set(line.strip().lower() for line in f if line.strip())
301
 
302
 
303
 
304
+ # whitelist = set([
305
+ # # Search Engines
306
+ # 'google', 'bing', 'yahoo', 'duckduckgo', 'baidu', 'ask',
307
 
308
+ # # Social Media
309
+ # 'facebook', 'instagram', 'twitter', 'linkedin', 'snapchat', 'tiktok',
310
+ # 'threads', 'pinterest', 'reddit', 'quora',
311
 
312
+ # # Communication Tools
313
+ # 'whatsapp', 'telegram', 'skype', 'zoom', 'meet', 'discord',
314
+ # 'teams', 'signal', 'messenger',
315
 
316
+ # # Global E-commerce
317
+ # 'amazon', 'ebay', 'shopify', 'alibaba', 'walmart', 'target',
318
+ # 'etsy', 'shein', 'bestbuy', 'costco', 'newegg',
319
 
320
+ # # Indian E-commerce / Services
321
+ # 'flipkart', 'myntra', 'ajio', 'nykaa', 'meesho', 'snapdeal',
322
+ # 'paytm', 'phonepe', 'mobikwik', 'zomato', 'swiggy', 'ola', 'uber', 'bookmyshow',
323
+ # 'ixigo', 'makemytrip', 'yatra', 'redbus', 'bigbasket', 'grofers', 'blinkit','https://universalcollegeofengineering.edu.in',
324
 
325
+ # # Education / Productivity
326
+ # 'youtube', 'docs', 'drive', 'calendar', 'photos', 'zoom',
327
+ # 'gmail', 'notion', 'edx', 'coursera', 'udemy', 'khanacademy', 'byjus', 'unacademy',
328
 
329
+ # # News / Media / Tech
330
+ # 'bbc', 'cnn', 'nyt', 'forbes', 'bloomberg', 'reuters',
331
+ # 'ndtv', 'indiatimes', 'thehindu', 'hindustantimes', 'indiatoday',
332
+ # 'techcrunch', 'verge', 'wired',
333
 
334
+ # # Streaming / Entertainment
335
+ # 'netflix', 'hotstar', 'primevideo', 'spotify', 'gaana', 'wynk', 'saavn', 'voot',
336
 
337
+ # # Dev & Tools
338
+ # 'github', 'stackoverflow', 'medium', 'gitlab', 'bitbucket',
339
+ # 'adobe', 'figma', 'canva',
340
 
341
+ # # Financial / Banking
342
+ # 'hdfcbank', 'icicibank', 'sbi', 'axisbank', 'kotak', 'boi', 'upi',
343
+ # 'visa', 'mastercard', 'paypal', 'stripe', 'razorpay', 'phonepe', 'paytm',
344
 
345
+ # # Government / Utilities
346
+ # 'gov', 'nic', 'irctc', 'uidai', 'mygov', 'incometax', 'aadhar', 'rbi',
347
 
348
+ # # Others Common
349
+ # 'airtel', 'jio', 'bsnl', 'vi', 'speedtest', 'cricbuzz', 'espn', 'espncricinfo',
350
+ # 'wikipedia', 'mozilla', 'opera', 'chrome', 'android', 'apple', 'windows', 'microsoft'
351
+ # ])
352
 
353
 
354
+ # def is_gibberish_word(word):
355
+ # word = word.lower()
356
+ # if len(word) < 4:
357
+ # return False
358
+ # if not word.isalpha():
359
+ # return True
360
+ # return word not in valid_words
361
 
362
+ # def is_rule_based_spam(url):
363
+ # url = url.strip().lower()
364
+ # print(f"\n🌐 Checking URL: {url}")
365
 
366
+ # try:
367
+ # parsed = urlparse(url if url.startswith(("http://", "https://")) else "http://" + url)
368
+ # domain = parsed.netloc
369
+ # path = parsed.path
370
+ # query = parsed.query
371
+ # fragment = parsed.fragment
372
+ # except Exception as e:
373
+ # print("❌ Failed: Malformed URL")
374
+ # return True, f"❌ Malformed URL: {e}"
375
 
376
+ # if not domain:
377
+ # print("❌ Failed: Empty domain after parsing")
378
+ # return True, "❌ Empty domain after parsing"
379
+ # else:
380
+ # print("✅ Parsed domain:", domain)
381
 
382
+ # # --- Rules ---
383
 
384
+ # if '.' not in domain:
385
+ # print("❌ Failed Rule 1: Domain missing dot (.)")
386
+ # return True, "❌ Domain missing dot (.)"
387
+ # else:
388
+ # print("✅ Passed Rule 1: Domain contains dot")
389
 
390
+ # trusted_tlds = ['.gov', '.nic.in', '.edu', '.ac.in', '.mil', '.org', '.int', '.co.in', '.gov.in', '.res.in', '.net.in', '.nic.gov.in']
391
+ # if any(tld in domain for tld in trusted_tlds):
392
+ # print("✅ Passed Rule 2: Trusted TLD")
393
+ # else:
394
+ # print("✅ Passed Rule 2: Not a trusted TLD (but not blocked yet)")
395
 
396
+ # try:
397
+ # ext = tldextract.extract(url)
398
+ # domain_name = ext.domain
399
+ # suffix = ext.suffix
400
+ # print(f"✅ Extracted domain name: {domain_name}, suffix: {suffix}")
401
+ # except Exception:
402
+ # print("❌ Failed: Cannot extract domain/suffix")
403
+ # return True, "❌ Cannot extract domain/suffix"
404
+
 
 
 
 
 
405
 
406
+ # if domain_name in whitelist:
407
+ # print("✅ Skipping gibberish check for whitelisted domain")
408
+ # else:
409
+ # parts = re.split(r'[\/\.\-\_\?\=\&]', url)
410
+ # long_parts = [p for p in parts if len(p) >= 5 and p.isalpha()]
411
+ # gibberish_parts = [p for p in long_parts if is_gibberish_word(p)]
412
 
413
+ # if len(long_parts) > 0 and (len(gibberish_parts) / len(long_parts)) > 0.6:
414
+ # print("❌ Failed Rule 15: Mostly gibberish words")
415
+ # return True, "🧾 Mostly gibberish / non-dictionary words"
416
+ # else:
417
+ # print("✅ Passed Rule 15: Words are mostly valid")
418
+
419
+ # if re.match(r'^\d{1,3}(\.\d{1,3}){3}$', domain):
420
+ # print("❌ Failed Rule 3: IP address as domain")
421
+ # return True, "📟 IP address instead of domain"
422
+ # else:
423
+ # print("✅ Passed Rule 3: Domain is not an IP address")
424
+
425
+ # bad_tlds = ['.xyz', '.tk', '.ml', '.ga', '.cf', '.top', '.gq', '.cn', '.ru', '.pw', '.bid', '.link', '.loan', '.party', '.science', '.stream', '.webcam', '.online', '.site', '.website', '.space', '.club', '.buzz', '.info']
426
+ # if any(suffix == tld.lstrip('.') for tld in bad_tlds):
427
+ # print(f"❌ Failed Rule 4: Suspicious TLD ({suffix})")
428
+ # return True, "🧨 Suspicious top-level domain"
429
+ # else:
430
+ # print("✅ Passed Rule 4: TLD not in suspicious list")
431
+
432
+ # if len(domain_name) > 30:
433
+ # print("❌ Failed Rule 5: Very long domain name")
434
+ # return True, "📏 Very long and unrecognized domain name"
435
+ # else:
436
+ # print("✅ Passed Rule 5: Domain name length is acceptable")
437
+
438
+ # numeric_chars = sum(c.isdigit() for c in domain_name)
439
+ # if len(domain_name) > 5 and (numeric_chars / len(domain_name)) > 0.5:
440
+ # print("❌ Failed Rule 6: Numeric-heavy domain")
441
+ # return True, "🔢 Numeric-heavy domain name"
442
+ # else:
443
+ # print("✅ Passed Rule 6: Domain has few or no digits")
444
+
445
+ # if domain_name.count('-') > 3 or re.search(r'[!@#$%^&*()_+={}\[\]|\\:;"\'<>,?/`~]', domain_name):
446
+ # print("❌ Failed Rule 7: Too many special characters")
447
+ # return True, "➖ Excessive hyphens or special characters in domain"
448
+ # else:
449
+ # print("✅ Passed Rule 7: No excessive special characters")
450
+
451
+ # if domain_name.startswith('xn--'):
452
+ # print("❌ Failed Rule 8: Punycode detected")
453
+ # return True, "🌐 Punycode detected (potential homograph attack)"
454
+ # else:
455
+ # print("✅ Passed Rule 8: No punycode")
456
+
457
+ # subdomains = ext.subdomain.split('.') if ext.subdomain else []
458
+ # if len(subdomains) > 4:
459
+ # print("❌ Failed Rule 9: Excessive subdomains")
460
+ # return True, "🌳 Excessive subdomains"
461
+ # else:
462
+ # print("✅ Passed Rule 9: Subdomain count is normal")
463
+
464
+ # if re.match(r'^\d{1,3}(-\d{1,3}){3}$', domain_name.replace('.', '-')):
465
+ # print("❌ Failed Rule 10: Domain name formatted like an IP")
466
+ # return True, "🔢 Domain name formatted like an IP"
467
+ # else:
468
+ # print("✅ Passed Rule 10: Domain name is not IP-like")
469
+
470
+ # phishing_keywords = [
471
+ # 'login', 'verify', 'secure', 'account', 'update', 'confirm', 'authenticate',
472
+ # 'free', 'bonus', 'offer', 'prize', 'winner', 'gift', 'coupon', 'discount',
473
+ # 'bank', 'paypal', 'creditcard', 'mastercard', 'visa', 'amex', 'westernunion',
474
+ # 'signin', 'click', 'password', 'unlock', 'recover', 'validate', 'urgency',
475
+ # 'limitedtime', 'expires', 'suspicious', 'alert', 'important', 'actionrequired'
476
+ # ]
477
+ # full_url_parts = url + path + query + fragment
478
+ # if any(keyword in full_url_parts for keyword in phishing_keywords):
479
+ # print("❌ Failed Rule 11: Contains phishing keyword")
480
+ # return True, "🔍 Contains phishing keyword"
481
+ # else:
482
+ # print("✅ Passed Rule 11: No phishing keywords found")
483
+
484
+ # if len(path) > 100:
485
+ # print("❌ Failed Rule 12: Very long path")
486
+ # return True, "📜 Very long URL path"
487
+ # else:
488
+ # print("✅ Passed Rule 12: Path length is acceptable")
489
+
490
+ # suspicious_extensions = ['.exe', '.zip', '.rar', '.js', '.php', '.asp', '.aspx', '.jsp', '.sh']
491
+ # if any(path.endswith(ext) for ext in suspicious_extensions):
492
+ # print("❌ Failed Rule 13: Suspicious file extension")
493
+ # return True, "📁 Suspicious file extension in path"
494
+ # else:
495
+ # print("✅ Passed Rule 13: No suspicious file extension")
496
+
497
+ # if any(param in query for param in ['redirect=', 'url=', 'goto=', 'link=']):
498
+ # print("❌ Failed Rule 14: Redirect pattern in query")
499
+ # return True, "🔗 Potential redirect link"
500
+ # else:
501
+ # print("✅ Passed Rule 14: No redirect pattern in query")
502
+
503
+ # # Gibberish Check
504
+ # parts = re.split(r'[\/\.\-\_\?\=\&]', url)
505
+ # long_parts = [p for p in parts if len(p) >= 5 and p.isalpha()]
506
+ # gibberish_parts = [p for p in long_parts if is_gibberish_word(p)]
507
+ # if len(long_parts) > 0 and (len(gibberish_parts) / len(long_parts)) > 0.6:
508
+ # print("❌ Failed Rule 15: Mostly gibberish words")
509
+ # return True, "🧾 Mostly gibberish / non-dictionary words"
510
+ # else:
511
+ # print("✅ Passed Rule 15: Words are mostly valid")
512
+
513
+ # print("✅ All rule-based checks passed")
514
+ # return False, None
515
 
516
+ # end of naive bayes
517
+ # start of naive bayes
+ # --- Dictionary Words ---
518
+ # valid_words = set(words.words())
519
+
520
+ # # --- Load Trusted Keywords ---
521
+ # def load_trusted_keywords(file_path):
522
+ # with open(file_path, 'r', encoding='utf-8') as f:
523
+ # return set(line.strip().lower() for line in f if line.strip())
524
+
525
+ # # --- Whitelist (common safe domains/services) ---
526
+ # whitelist = set([
527
+ # # Search Engines
528
+ # 'google', 'bing', 'yahoo', 'duckduckgo', 'baidu', 'ask',
529
+
530
+ # # Social Media
531
+ # 'facebook', 'instagram', 'twitter', 'linkedin', 'snapchat', 'tiktok',
532
+ # 'threads', 'pinterest', 'reddit', 'quora',
533
+
534
+ # # Communication Tools
535
+ # 'whatsapp', 'telegram', 'skype', 'zoom', 'meet', 'discord',
536
+ # 'teams', 'signal', 'messenger',
537
+
538
+ # # Global E-commerce
539
+ # 'amazon', 'ebay', 'shopify', 'alibaba', 'walmart', 'target',
540
+ # 'etsy', 'shein', 'bestbuy', 'costco', 'newegg',
541
+
542
+ # # Indian E-commerce / Services
543
+ # 'flipkart', 'myntra', 'ajio', 'nykaa', 'meesho', 'snapdeal',
544
+ # 'paytm', 'phonepe', 'mobikwik', 'zomato', 'swiggy', 'ola', 'uber', 'bookmyshow',
545
+ # 'ixigo', 'makemytrip', 'yatra', 'redbus', 'bigbasket', 'grofers', 'blinkit',
546
+ # 'universalcollegeofengineering',
547
+
548
+ # # Education / Productivity
549
+ # 'youtube', 'docs', 'drive', 'calendar', 'photos', 'gmail', 'notion',
550
+ # 'edx', 'coursera', 'udemy', 'khanacademy', 'byjus', 'unacademy',
551
+
552
+ # # News / Media / Tech
553
+ # 'bbc', 'cnn', 'nyt', 'forbes', 'bloomberg', 'reuters',
554
+ # 'ndtv', 'indiatimes', 'thehindu', 'hindustantimes', 'indiatoday',
555
+ # 'techcrunch', 'verge', 'wired',
556
+
557
+ # # Streaming / Entertainment
558
+ # 'netflix', 'hotstar', 'primevideo', 'spotify', 'gaana', 'wynk', 'saavn', 'voot',
559
+
560
+ # # Dev & Tools
561
+ # 'github', 'stackoverflow', 'medium', 'gitlab', 'bitbucket',
562
+ # 'adobe', 'figma', 'canva',
563
+
564
+ # # Financial / Banking
565
+ # 'hdfcbank', 'icicibank', 'sbi', 'axisbank', 'kotak', 'boi', 'upi',
566
+ # 'visa', 'mastercard', 'paypal', 'stripe', 'razorpay', 'phonepe', 'paytm',
567
+
568
+ # # Government / Utilities
569
+ # 'gov', 'nic', 'irctc', 'uidai', 'mygov', 'incometax', 'aadhar', 'rbi',
570
+
571
+ # # Others Common
572
+ # 'airtel', 'jio', 'bsnl', 'vi', 'speedtest', 'cricbuzz', 'espn', 'espncricinfo',
573
+ # 'wikipedia', 'mozilla', 'opera', 'chrome', 'android', 'apple', 'windows', 'microsoft'
574
+ # ])
575
+
576
+ # # --- Gibberish Check Helper ---
577
+ # def is_gibberish_word(word):
578
+ # word = word.lower()
579
+ # if len(word) < 4:
580
+ # return False
581
+ # if not word.isalpha():
582
+ # return True
583
+ # return word not in valid_words
584
+
585
+
586
+ # # --- RULE BASED CHECK ---
587
+ # def is_rule_based_spam(url, skip_gibberish=False):
588
+ # url = url.strip().lower()
589
+ # print(f"\n🌐 Checking URL: {url}")
590
 
591
+ # try:
592
+ # parsed = urlparse(url if url.startswith(("http://", "https://")) else "http://" + url)
593
+ # domain = parsed.netloc
594
+ # path = parsed.path
595
+ # query = parsed.query
596
+ # fragment = parsed.fragment
597
+ # except Exception as e:
598
+ # return True, f"❌ Malformed URL: {e}"
 
 
 
 
 
 
 
 
 
 
 
599
 
600
+ # if not domain:
601
+ # return True, "❌ Empty domain after parsing"
 
 
 
602
 
603
+ # # Rule 1: Dot in domain
604
+ # if '.' not in domain:
605
+ # return True, "❌ Domain missing dot (.)"
 
 
 
606
 
607
+ # # Trusted TLDs
608
+ # trusted_tlds = [
609
+ # '.gov', '.nic.in', '.edu', '.ac.in', '.mil', '.org', '.int',
610
+ # '.co.in', '.gov.in', '.res.in', '.net.in', '.nic.gov.in'
611
+ # ]
 
 
 
 
 
 
 
 
 
 
612
 
613
+ # try:
614
+ # ext = tldextract.extract(url)
615
+ # domain_name = ext.domain
616
+ # suffix = ext.suffix
617
+ # subdomains = ext.subdomain.split('.') if ext.subdomain else []
618
+ # except Exception:
619
+ # return True, "❌ Cannot extract domain/suffix"
620
+
621
+ # # --- WHITELIST / TRUSTED SKIP ---
622
+ # if any(tld in domain for tld in trusted_tlds) or domain_name in whitelist:
623
+ # print("✅ Trusted/whitelisted → gibberish will be skipped")
624
+ # skip_gibberish = True
625
+
626
+ # # Rule 3: IP as domain
627
+ # if re.match(r'^\d{1,3}(\.\d{1,3}){3}$', domain):
628
+ # return True, "📟 IP address instead of domain"
629
+
630
+ # # Rule 4: Bad TLD
631
+ # bad_tlds = ['.xyz', '.tk', '.ml', '.ga', '.cf', '.top', '.gq', '.cn',
632
+ # '.ru', '.pw', '.bid', '.link', '.loan', '.party', '.science',
633
+ # '.stream', '.webcam', '.online', '.site', '.website', '.space',
634
+ # '.club', '.buzz', '.info']
635
+ # if any(suffix == tld.lstrip('.') for tld in bad_tlds):
636
+ # return True, "🧨 Suspicious top-level domain"
637
+
638
+ # # Rule 5: Long domain
639
+ # if len(domain_name) > 30:
640
+ # return True, "📏 Very long and unrecognized domain name"
641
+
642
+ # # Rule 6: Numeric-heavy
643
+ # numeric_chars = sum(c.isdigit() for c in domain_name)
644
+ # if len(domain_name) > 5 and (numeric_chars / len(domain_name)) > 0.5:
645
+ # return True, "🔢 Numeric-heavy domain name"
646
+
647
+ # # Rule 7: Special characters
648
+ # if domain_name.count('-') > 3 or re.search(r'[!@#$%^&*()_+={}\[\]|\\:;"\'<>,?/`~]', domain_name):
649
+ # return True, "➖ Excessive hyphens or special characters in domain"
650
+
651
+ # # Rule 8: Punycode
652
+ # if domain_name.startswith('xn--'):
653
+ # return True, "🌐 Punycode detected (potential homograph attack)"
654
+
655
+ # # Rule 9: Excessive subdomains
656
+ # if len(subdomains) > 4:
657
+ # return True, "🌳 Excessive subdomains"
658
+
659
+ # # Rule 10: Domain looks like IP
660
+ # if re.match(r'^\d{1,3}(-\d{1,3}){3}$', domain_name.replace('.', '-')):
661
+ # return True, "🔢 Domain name formatted like an IP"
662
+
663
+ # # Rule 11: Phishing keywords
664
+ # phishing_keywords = [
665
+ # 'login', 'verify', 'secure', 'account', 'update', 'confirm', 'authenticate',
666
+ # 'free', 'bonus', 'offer', 'prize', 'winner', 'gift', 'coupon', 'discount',
667
+ # 'bank', 'paypal', 'creditcard', 'mastercard', 'visa', 'amex', 'westernunion',
668
+ # 'signin', 'click', 'password', 'unlock', 'recover', 'validate', 'urgency',
669
+ # 'limitedtime', 'expires', 'suspicious', 'alert', 'important', 'actionrequired'
670
+ # ]
671
+ # full_url_parts = url + path + query + fragment
672
+ # if any(keyword in full_url_parts for keyword in phishing_keywords):
673
+ # return True, "🔍 Contains phishing keyword"
674
+
675
+ # # Rule 12: Long path
676
+ # if len(path) > 100:
677
+ # return True, "📜 Very long URL path"
678
+
679
+ # # Rule 13: Suspicious file extensions
680
+ # suspicious_extensions = ['.exe', '.zip', '.rar', '.js', '.php', '.asp', '.aspx', '.jsp', '.sh']
681
+ # if any(path.endswith(ext) for ext in suspicious_extensions):
682
+ # return True, "📁 Suspicious file extension in path"
683
+
684
+ # # Rule 14: Redirect in query
685
+ # if any(param in query for param in ['redirect=', 'url=', 'goto=', 'link=']):
686
+ # return True, "🔗 Potential redirect link"
687
+
688
+ # # Rule 15: Gibberish (only if not skipped)
689
+ # if not skip_gibberish:
690
+ # parts = re.split(r'[\/\.\-\_\?\=\&]', url)
691
+ # long_parts = [p for p in parts if len(p) >= 5 and p.isalpha()]
692
+ # gibberish_parts = [p for p in long_parts if is_gibberish_word(p)]
693
+ # if len(long_parts) > 0 and (len(gibberish_parts) / len(long_parts)) > 0.6:
694
+ # return True, "🧾 Mostly gibberish / non-dictionary words"
695
+
696
+ # return False, None
697
+ # # end of naive bayes
698
 
699
 
700
 
 
999
 
1000
  def clean_text(text):
1001
  return text.lower().strip()
1002
+
1003
+ import re
1004
+
1005
+ # Load saved model and vectorizer
1006
+ model = joblib.load("Models/logistic_model.pkl")
1007
+ vectorizer = joblib.load("Models/logvectorizer.pkl")
1008
+
1009
+ # Text cleaning
1010
+ def clean_text(text):
1011
+ text = text.lower()
1012
+ text = re.sub(r'\W', ' ', text)
1013
+ text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
1014
+ text = re.sub(r'\s+', ' ', text)
1015
+ return text.strip()
1016
 
1017
  @app.route('/logistic', methods=['GET', 'POST'])
1018
  def logistic():
1019
+ prediction, confidence_percentage, cleaned, tokens, probability = None, None, None, None, None
 
 
 
 
 
1020
 
1021
  if request.method == "POST":
1022
+ msg = request.form.get('message', '')
1023
+ cleaned = clean_text(msg)
1024
+ tokens = cleaned.split()
1025
+
1026
  try:
 
 
1027
  vector = vectorizer.transform([cleaned])
1028
  probability = model.predict_proba(vector)[0][1]
1029
  prediction = "Spam" if probability >= 0.5 else "Not Spam"
1030
  confidence_percentage = round(probability * 100, 2)
1031
  except Exception as e:
1032
+ print("Error predicting:", e)
1033
  prediction = "Error"
1034
+ confidence_percentage = 0
1035
+
1036
+ return render_template(
1037
+ "logistic.html",
1038
+ prediction=prediction,
1039
+ confidence_percentage=confidence_percentage,
1040
+ cleaned=cleaned,
1041
+ tokens=tokens,
1042
+ probability=round(probability, 4) if probability else None,
1043
+ source="sms"
1044
+ )
1045
 
 
 
 
 
 
 
 
 
 
1046
  @app.route('/logistic-sms', methods=['POST'])
1047
  def logistic_sms():
1048
  try:
1049
  data = request.get_json()
1050
  msg = data.get('message', '')
1051
  cleaned = clean_text(msg)
1052
+ tokens = cleaned.split()
1053
+
1054
  vector = vectorizer.transform([cleaned])
1055
  probability = model.predict_proba(vector)[0][1]
1056
  prediction = "Spam" if probability >= 0.5 else "Not Spam"
 
1061
  "confidence": confidence_percentage,
1062
  "probability": round(probability, 4),
1063
  "cleaned": cleaned,
1064
+ "tokens": tokens,
 
1065
  "source": "json"
1066
  })
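The `/logistic-sms` route above accepts JSON and now returns the token list alongside the prediction. A minimal sketch of calling it, assuming the Flask app is running locally:

import requests

resp = requests.post(
    "http://127.0.0.1:5000/logistic-sms",   # hypothetical local host/port
    json={"message": "Congratulations, you won a free prize! Click here"},
    timeout=30,
)
print(resp.json())  # keys visible in the route above: confidence, probability, cleaned, tokens, source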
1067
 
 
1460
 
1461
  # --- Naive Bayes Routes ---
1462
 
1463
+ from urllib.parse import urlparse
1464
+ from sklearn.naive_bayes import GaussianNB
1465
+ from nltk.corpus import words
1466
 
1467
+ model_path = "Models/nb_url_model.pkl"
1468
+ vectorizer_path = "Models/nb_url_vectorizer.pkl"
1469
 
1470
+ if os.path.exists(model_path) and os.path.exists(vectorizer_path):
1471
+ nb_model = joblib.load(model_path)
1472
+ vectorizer = joblib.load(vectorizer_path)
1473
+ print("✅ Loaded Naive Bayes URL model")
1474
+ else:
1475
+ nb_model, vectorizer = None, None
1476
+ print("❌ Model/vectorizer not found")
1477
 
1478
 
1479
  @app.route('/nb_spam')
1480
  def nb_spam_page():
1481
  return render_template('NB_spam.html')
1482
 
 
 
 
 
 
 
1483
 
1484
+ import re
1485
+ from urllib.parse import urlparse
1486
+ from spellchecker import SpellChecker
1487
+ import wordninja
1488
 
 
 
1489
 
 
 
 
1490
 
1491
+ # ---- Whitelist (your full one, unchanged) ----
1492
+ whitelist = set([
1493
+ # Search Engines
1494
+ 'google', 'bing', 'yahoo', 'duckduckgo', 'baidu', 'ask',
1495
 
1496
+ # Social Media
1497
+ 'facebook', 'instagram', 'twitter', 'linkedin', 'snapchat', 'tiktok',
1498
+ 'threads', 'pinterest', 'reddit', 'quora',
 
 
1499
 
1500
+ # Communication Tools
1501
+ 'whatsapp', 'telegram', 'skype', 'zoom', 'meet', 'discord',
1502
+ 'teams', 'signal', 'messenger',
1503
 
1504
+ # Global E-commerce
1505
+ 'amazon', 'ebay', 'shopify', 'alibaba', 'walmart', 'target',
1506
+ 'etsy', 'shein', 'bestbuy', 'costco', 'newegg',
 
 
1507
 
1508
+ # Indian E-commerce / Services
1509
+ 'flipkart', 'myntra', 'ajio', 'nykaa', 'meesho', 'snapdeal',
1510
+ 'paytm', 'phonepe', 'mobikwik', 'zomato', 'swiggy', 'ola', 'uber', 'bookmyshow',
1511
+ 'ixigo', 'makemytrip', 'yatra', 'redbus', 'bigbasket', 'grofers', 'blinkit',
1512
+ 'universalcollegeofengineering',
1513
 
1514
+ # Education / Productivity
1515
+ 'youtube', 'docs', 'drive', 'calendar', 'photos', 'gmail', 'notion',
1516
+ 'edx', 'coursera', 'udemy', 'khanacademy', 'byjus', 'unacademy',
1517
 
1518
+ # News / Media / Tech
1519
+ 'bbc', 'cnn', 'nyt', 'forbes', 'bloomberg', 'reuters',
1520
+ 'ndtv', 'indiatimes', 'thehindu', 'hindustantimes', 'indiatoday',
1521
+ 'techcrunch', 'verge', 'wired',
1522
+
1523
+ # Streaming / Entertainment
1524
+ 'netflix', 'hotstar', 'primevideo', 'spotify', 'gaana', 'wynk', 'saavn', 'voot',
1525
+
1526
+ # Dev & Tools
1527
+ 'github', 'stackoverflow', 'medium', 'gitlab', 'bitbucket',
1528
+ 'adobe', 'figma', 'canva',
1529
+
1530
+ # Financial / Banking
1531
+ 'hdfcbank', 'icicibank', 'sbi', 'axisbank', 'kotak', 'boi', 'upi',
1532
+ 'visa', 'mastercard', 'paypal', 'stripe', 'razorpay', 'phonepe', 'paytm',
1533
+
1534
+ # Government / Utilities
1535
+ 'gov', 'nic', 'irctc', 'uidai', 'mygov', 'incometax', 'aadhar', 'rbi',
1536
+
1537
+ # Others Common
1538
+ 'airtel', 'jio', 'bsnl', 'vi', 'speedtest', 'cricbuzz', 'espn', 'espncricinfo',
1539
+ 'wikipedia', 'mozilla', 'opera', 'chrome', 'android', 'apple', 'windows', 'microsoft'
1540
+ ])
1541
+
1542
+ # ... your full whitelist from before ...
1543
+
1544
+
1545
+ # ---- Trusted & Bad TLDs ----
1546
+ trusted_tlds = [
1547
+ '.gov', '.nic.in', '.edu', '.ac.in', '.mil', '.org', '.int',
1548
+ '.co.in', '.gov.in', '.res.in', '.net.in', '.nic.gov.in'
1549
+ ]
1550
+
1551
+ # Expanded Bad TLDs (Rule 4)
1552
+ bad_tlds = [
1553
+ '.xyz', '.tk', '.ml', '.ga', '.cf', '.top', '.gq', '.cn',
1554
+ '.ru', '.pw', '.bid', '.link', '.loan', '.party', '.science',
1555
+ '.stream', '.webcam', '.online', '.site', '.website', '.space',
1556
+ '.club', '.buzz', '.info'
1557
+ ]
1558
+
1559
+ # Suspicious extensions (Rule 13)
1560
+ suspicious_extensions = ['.exe', '.zip', '.rar', '.js', '.php', '.asp', '.aspx', '.jsp', '.sh']
1561
+
1562
+ # Phishing keywords (Rule 11, your full list)
1563
+ phishing_keywords = [
1564
+ 'login', 'verify', 'secure', 'account', 'update', 'confirm', 'authenticate',
1565
+ 'free', 'bonus', 'offer', 'prize', 'winner', 'gift', 'coupon', 'discount',
1566
+ 'bank', 'paypal', 'creditcard', 'mastercard', 'visa', 'amex', 'westernunion',
1567
+ 'signin', 'click', 'password', 'unlock', 'recover', 'validate', 'urgency',
1568
+ 'limitedtime', 'expires', 'suspicious', 'alert', 'important', 'actionrequired'
1569
+ ]
1570
+
1571
+ # ---- Rules 5–14 ----
1572
+ rules = {
1573
+ 5: r"https?://\d{1,3}(\.\d{1,3}){3}",
1574
+ 6: r"@[A-Za-z0-9.-]+\.[A-Za-z]{2,}",
1575
+ 7: r"(free money|win now|click here)",
1576
+ 8: r"https?://[^\s]*\.(ru|cn|tk)",
1577
+ 9: r"https?://.{0,6}\..{2,6}/.{0,6}",
1578
+ 10: r"[0-9]{10,}",
1579
+ 12: r"https?://[^\s]*@[^\s]+",
1580
+ 13: r"https?://[^\s]*//[^\s]+",
1581
+ 14: r"https?://[^\s]*\?(?:[^=]+=[^&]*&){5,}",
1582
+ }
1583
+
1584
+
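These patterns would typically be applied in a single loop over the dictionary. A minimal sketch (the helper name and return format are assumptions, not taken from the app):

def check_rules(url):
    # Return the numbers of all rules whose regex matches the URL.
    hits = []
    for rule_no, pattern in rules.items():
        if re.search(pattern, url, re.IGNORECASE):
            hits.append(rule_no)
    return hits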
1585
+ # ---- Gibberish Check Helper (Rule 15) ----
1586
+ def is_gibberish_word(word):
1587
+ vowels = "aeiou"
1588
+ v_count = sum(c in vowels for c in word)
1589
+ return v_count / len(word) < 0.25
1590
+
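A quick sanity check of the vowel-ratio heuristic; note it divides by len(word), so callers should only pass non-empty words (extract_words below already filters out words shorter than 3 characters):

print(is_gibberish_word("xkqzt"))    # True  - 0 vowels out of 5
print(is_gibberish_word("account"))  # False - 3 vowels out of 7 (~0.43)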
1591
+ # # ---- Utility: Extract words from URL ----
1592
+ # def extract_words(url):
1593
+ # parsed = urlparse(url if url.startswith(("http://", "https://")) else "http://" + url)
1594
+ # raw = parsed.netloc.replace('-', '') + parsed.path.replace('-', '')
1595
+ # # Split using wordninja
1596
+ # words = wordninja.split(raw.lower())
1597
+ # # Keep only alphabetic words of length >= 3
1598
+ # words = [w for w in words if w.isalpha() and len(w) >= 3]
1599
+ # return words
1600
+ # ---- Extract words from URL ----
1601
+ def extract_words(url):
1602
+ parsed = urlparse(url if url.startswith(("http://", "https://")) else "http://" + url)
1603
+ parts = re.split(r'\W+', parsed.netloc + parsed.path)
1604
+ final_words = []
1605
+ for word in parts:
1606
+ if len(word) > 2 and word.isalpha():
1607
+ split_words = wordninja.split(word.lower())
1608
+ if len(split_words) <= 1:
1609
+ split_words = [word.lower()]
1610
+ final_words.extend(split_words)
1611
+ return final_words
1612
+
1613
+
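For illustration, the splitter behaves roughly as follows on a phishing-style URL; the example URL is hypothetical and the exact output depends on wordninja's segmentation:

print(extract_words("http://secure-login-update.xyz/account"))
# e.g. ['secure', 'login', 'update', 'xyz', 'account']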
1614
+ # --- URL prediction route (spell-check based) ---
1615
+ @app.route("/predict", methods=["POST"])
1616
+ def predict():
1617
+ try:
1618
+ data = request.get_json()
1619
+ url = data.get("url", "").lower()
1620
+ if not url:
1621
+ return jsonify({'error': 'No URL provided'}), 400
1622
+
1623
+ parsed = urlparse(url if url.startswith(("http://", "https://")) else "http://" + url)
1624
+ path = parsed.path
1625
 
1626
+ # ---- SpellChecker using built-in dictionary ----
1627
  spell = SpellChecker(distance=1)
1628
 
1629
+ # ---- Extract words and check spelling ----
1630
+ words = extract_words(url)
1631
+ # ignore known TLDs
1632
+ tlds_to_ignore = [tld.replace('.', '').replace('/', '') for tld in trusted_tlds + bad_tlds]
1633
+ words_for_spellcheck = [w for w in words if w not in tlds_to_ignore]
1634
+
1635
+ misspelled = spell.unknown(words_for_spellcheck)
1636
+ steps = [{"word": w, "valid": (w not in misspelled) or (w in tlds_to_ignore)} for w in words]
1637
+
1638
+ if misspelled:
1639
  return jsonify({
1640
+ "prediction": 1,
1641
+ "reason": f"🧾 Spelling errors: {', '.join(misspelled)}",
1642
+ "steps": steps
1643
+ })
1644
+ else:
1645
+ return jsonify({
1646
+ "prediction": 0,
1647
+ "reason": "✅ No spelling issues",
1648
+ "steps": steps
1649
  })
1650
 
1651
  except Exception as e:
1652
+ return jsonify({'error': f"An issue occurred during spell checking: {str(e)}"}), 500
 
 
1653
 
1654
 
1655
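The /predict route can be exercised with a small client call once the app is running (Flask's default port 5000 is assumed, and the sample URL is hypothetical); the response keys prediction, reason and steps follow the jsonify calls above:

import requests

resp = requests.post("http://127.0.0.1:5000/predict", json={"url": "paypa1-verify-login.xyz"})
print(resp.status_code, resp.json())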
 
 
1847
 
1848
  if __name__ == '__main__':
1849
  #app.run(debug=True, port=5000)
1850
+ app.run(debug=True, use_reloader=False)
load_file.py CHANGED
@@ -8,15 +8,26 @@ load_dotenv()
8
  # Get token from environment
9
  HF_TOKEN = os.getenv("HF_TOKEN")
10
 
11
- # Login (only needed if you dont use huggingface-cli)
12
  login(token=HF_TOKEN)
13
 
14
- REPO_ID = "deedrop1140/my-ml-models"
15
 
16
  def load_file(filename):
17
- file_path = hf_hub_download(
18
- repo_id=REPO_ID,
19
- filename=filename,
20
- token=HF_TOKEN # token is loaded from environment
21
  )
22
  return file_path
 
 
8
  # Get token from environment
9
  HF_TOKEN = os.getenv("HF_TOKEN")
10
 
11
+ # Login (only needed if you don't use huggingface-cli)
12
  login(token=HF_TOKEN)
13
 
14
+ REPO_ID = "deedrop1140/Neroml" # Replace with your repository ID
15
 
16
  def load_file(filename):
17
+ """
18
+ Downloads a specified file from the Hugging Face Hub repository.
19
+
20
+ Args:
21
+ filename (str): The name of the file to download from the repository.
22
+
23
+ Returns:
24
+ str: The local path where the downloaded file is stored.
25
+ """
26
+
27
+ file_path = hf_hub_download(
28
+ repo_id=REPO_ID,
29
+ filename=filename,
30
+ token=HF_TOKEN # token is loaded from environment
31
  )
32
  return file_path
33
+
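Typical usage of load_file together with joblib might look like this; the filename is a placeholder, not a confirmed artifact in the repository:

from load_file import load_file
import joblib

local_path = load_file("logistic_model.pkl")   # hypothetical filename in the HF repo
model = joblib.load(local_path)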
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
 
save_token.py CHANGED
@@ -2,4 +2,4 @@ import os
2
  from huggingface_hub import HfApi
3
 
4
  token = os.getenv("HF_TOKEN") # loaded from .env or system environment
5
- api = HfApi(token=token)
 
2
  from huggingface_hub import HfApi
3
 
4
  token = os.getenv("HF_TOKEN") # loaded from .env or system environment
5
+ api = HfApi(token=token)
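A quick way to confirm the token is valid is to ask the Hub which account it belongs to; whoami() is a standard HfApi method and raises if authentication fails:

info = api.whoami()
print(info["name"])   # the account the token authenticates as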
train_logistic_model.py CHANGED
@@ -1,55 +1,47 @@
1
  import pandas as pd
2
  import re
3
- from sklearn.model_selection import train_test_split
 
4
  from sklearn.feature_extraction.text import TfidfVectorizer
5
  from sklearn.linear_model import LogisticRegression
6
- import joblib
7
- import os
8
 
9
- # Load CSV
10
  df = pd.read_csv("data/spam.csv", encoding='latin-1')
11
-
12
- # Only keep the columns you need (for spam.csv structure)
13
  df = df[['v1', 'v2']]
14
  df.columns = ['label', 'message']
15
-
16
- # Drop duplicates
17
  df.drop_duplicates(inplace=True)
18
-
19
- # Fill missing values in messages with empty string (text can't use mean)
20
  df['message'] = df['message'].fillna("")
21
-
22
- # Fill missing values in label with mode (most common class)
23
  df['label'] = df['label'].fillna(df['label'].mode()[0])
24
 
25
- # Clean message text
26
  def clean_text(text):
27
- text = text.lower() # lowercase
28
- text = re.sub(r'\W', ' ', text) # remove non-words
29
- text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text) # remove single chars
30
- text = re.sub(r'\s+', ' ', text) # remove multiple spaces
31
  return text.strip()
32
 
33
  df['message'] = df['message'].apply(clean_text)
34
-
35
- # Label encoding: spam = 1, ham = 0
36
  df['label'] = df['label'].map({'ham': 0, 'spam': 1})
37
 
38
- # Split data
39
- X_train, X_test, y_train, y_test = train_test_split(df['message'], df['label'], test_size=0.2, random_state=42)
 
 
40
 
41
- # Vectorization
42
- vectorizer = TfidfVectorizer()
43
  X_train_vec = vectorizer.fit_transform(X_train)
44
  X_test_vec = vectorizer.transform(X_test)
45
 
46
- # Logistic Regression Model
47
  model = LogisticRegression()
48
  model.fit(X_train_vec, y_train)
49
 
50
- # Save model and vectorizer
51
  os.makedirs("Models", exist_ok=True)
52
  joblib.dump(model, "Models/logistic_model.pkl")
53
- joblib.dump(vectorizer, "Models/vectorizer.pkl")
54
 
55
- print("✅ Logistic model trained and saved!")
 
1
  import pandas as pd
2
  import re
3
+ import os
4
+ import joblib
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.linear_model import LogisticRegression
7
+ from sklearn.model_selection import train_test_split
 
8
 
9
+ # Load data
10
  df = pd.read_csv("data/spam.csv", encoding='latin-1')
 
 
11
  df = df[['v1', 'v2']]
12
  df.columns = ['label', 'message']
 
 
13
  df.drop_duplicates(inplace=True)
 
 
14
  df['message'] = df['message'].fillna("")
 
 
15
  df['label'] = df['label'].fillna(df['label'].mode()[0])
16
 
17
+ # Clean text
18
  def clean_text(text):
19
+ text = text.lower()
20
+ text = re.sub(r'\W', ' ', text)
21
+ text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
22
+ text = re.sub(r'\s+', ' ', text)
23
  return text.strip()
24
 
25
  df['message'] = df['message'].apply(clean_text)
 
 
26
  df['label'] = df['label'].map({'ham': 0, 'spam': 1})
27
 
28
+ # Train-test split
29
+ X_train, X_test, y_train, y_test = train_test_split(
30
+ df['message'], df['label'], test_size=0.2, random_state=42
31
+ )
32
 
33
+ # Vectorize
34
+ vectorizer = TfidfVectorizer(max_features=5000)
35
  X_train_vec = vectorizer.fit_transform(X_train)
36
  X_test_vec = vectorizer.transform(X_test)
37
 
38
+ # Train model
39
  model = LogisticRegression()
40
  model.fit(X_train_vec, y_train)
41
 
42
+ # Save model
43
  os.makedirs("Models", exist_ok=True)
44
  joblib.dump(model, "Models/logistic_model.pkl")
45
+ joblib.dump(vectorizer, "Models/logvectorizer.pkl")
46
 
47
+ print("✅ Logistic model trained & saved successfully!")