quantum-drive committed on
Commit
cf27b43
·
verified ·
1 Parent(s): 4365d3a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +478 -393
app.py CHANGED
@@ -9,454 +9,539 @@ import whois
9
  import dns.resolver
10
  from urllib.parse import urlparse
11
  from datetime import datetime
 
 
12
 
13
  # -------------------------------
14
  # Load Trained Models
15
  # -------------------------------
16
- phishing_model = joblib.load("phishing_stack.pkl")
17
- malware_model = joblib.load("new_malware_stack.pkl")
 
 
 
 
18
 
19
  # -------------------------------
20
- # Enhanced Feature Extraction
21
  # -------------------------------
22
- def extract_phishing_features(url):
23
- parsed = urlparse(url)
24
- hostname = parsed.hostname if parsed.hostname else ""
25
- tld = hostname.split('.')[-1] if '.' in hostname else ""
26
-
27
- return {
28
- "url_length": len(url),
29
- "hostname_length": len(hostname),
30
- "num_dots": url.count('.'),
31
- "num_hyphens": url.count('-'),
32
- "num_digits": sum(char.isdigit() for char in url),
33
- "num_special_chars": len(re.findall(r"[^\w]", url)) - url.count('/'),
34
- "has_ip_address": 1 if re.match(r"\d+\.\d+\.\d+\.\d+", hostname) else 0,
35
- "has_https": 1 if parsed.scheme == "https" else 0,
36
- "has_suspicious_words": 1 if any(word in url.lower() for word in
37
- ["login", "secure", "update", "verify", "account", "banking", "paypal"]) else 0,
38
- "is_shortened": 1 if any(short in url for short in
39
- ["bit.ly", "tinyurl", "goo.gl", "t.co", "ow.ly", "is.gd"]) else 0,
40
- "tld": tld
41
- }
42
-
43
- def extract_malware_features(url):
44
- parsed = urlparse(url)
45
- hostname = parsed.hostname or ""
46
- scheme = parsed.scheme
47
-
48
- # Basic URL features
49
- url_length = len(url)
50
- hostname_length = len(hostname)
51
- num_dots = url.count('.')
52
- num_hyphens = url.count('-')
53
- num_digits = len(re.findall(r'\d', url))
54
- special_chars = set(string.punctuation) - {'/'}
55
- num_specials = sum(1 for c in url if c in special_chars)
56
- has_suspicious_keyword = any(k in url.lower() for k in
57
- ['login', 'secure', 'verify', 'update', 'download', 'install', 'free'])
58
- has_ip = bool(re.match(r'https?://(\d{1,3}\.){3}\d{1,3}', url))
59
- is_https = scheme == 'https'
60
- is_shortened = any(s in url for s in
61
- ['bit.ly', 'tinyurl.com', 'goo.gl', 't.co', 'ow.ly', 'shorte.st'])
62
- tld = hostname.split('.')[-1] if '.' in hostname else ''
63
-
64
- # Network features
65
- try:
66
- ip_address = socket.gethostbyname(hostname)
67
- except:
68
- ip_address = None
69
-
70
- # WHOIS features
71
- try:
72
- w = whois.whois(url)
73
- domain_age = (datetime.now() - w.creation_date[0]).days if w.creation_date else -1
74
- domain_expiry = (w.expiration_date[0] - datetime.now()).days if w.expiration_date else -1
75
- except:
76
- domain_age = domain_expiry = -1
77
-
78
- # DNS features
79
- try:
80
- answers = dns.resolver.resolve(hostname, 'A')
81
- ttl = answers.rrset.ttl
82
- except:
83
- ttl = -1
84
 
85
- # SSL features
86
- ssl_issuer = "Unknown"
87
- ssl_valid = False
88
- if is_https and hostname:
89
- try:
90
- ctx = ssl.create_default_context()
91
- with ctx.wrap_socket(socket.socket(), server_hostname=hostname) as s:
92
- s.settimeout(3)
93
- s.connect((hostname, 443))
94
- cert = s.getpeercert()
95
- issuer = dict(x[0] for x in cert['issuer'])['organizationName']
96
- ssl_issuer = issuer if issuer else "Unknown"
97
- ssl_valid = datetime.strptime(cert['notAfter'], '%b %d %H:%M:%S %Y %Z') > datetime.now()
98
- except:
99
- pass
100
 
101
- return {
102
- "url_length": url_length,
103
- "hostname_length": hostname_length,
104
- "num_dots": num_dots,
105
- "num_hyphens": num_hyphens,
106
- "num_digits": num_digits,
107
- "num_special_chars": num_specials,
108
- "has_suspicious_keyword": int(has_suspicious_keyword),
109
- "has_ip_address": int(has_ip),
110
- "is_https": int(is_https),
111
- "is_shortened": int(is_shortened),
112
- "tld": tld,
113
- "domain_age_days": domain_age,
114
- "domain_expiry_days": domain_expiry,
115
- "dns_ttl": ttl,
116
- "ssl_issuer": ssl_issuer,
117
- "ssl_valid": int(ssl_valid)
118
- }
119
 
120
  # -------------------------------
121
- # Prepare Model Inputs
122
  # -------------------------------
123
- def prepare_phishing_input(url):
124
- features = extract_phishing_features(url)
125
- df = pd.DataFrame([features])
126
- df = pd.get_dummies(df, columns=["tld"], prefix="tld")
127
- df = df.reindex(columns=phishing_model.feature_names_in_, fill_value=0)
128
- return df
129
-
130
- def prepare_malware_input(url):
131
- features = extract_malware_features(url)
132
- df = pd.DataFrame([features])
133
- df = pd.get_dummies(df, columns=["tld", "ssl_issuer"], prefix=["tld", "ssl_issuer"])
134
- df = df.reindex(columns=malware_model.feature_names_in_, fill_value=0)
135
- return df
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
  # -------------------------------
138
- # PREDICTION NORMALIZATION
139
  # -------------------------------
140
- def normalize_prediction(prediction):
141
- """Normalize different prediction formats to standard format"""
142
- pred_str = str(prediction).lower().strip()
143
-
144
- # Handle different formats that might come from models
145
- if pred_str in ['phishing', '1', 'malicious', 'threat', 'bad']:
146
- return 'threat'
147
- elif pred_str in ['benign', '0', 'safe', 'good', 'legitimate']:
148
- return 'benign'
149
- else:
150
- return 'unknown'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
  # -------------------------------
153
- # IMPROVED TRUTH TABLE DECISION LOGIC
154
  # -------------------------------
155
- # ENHANCED ANALYZE_URL FUNCTION - Replace completely
156
-
157
- def analyze_url(url):
158
  try:
159
- # Get model predictions
160
- phishing_pred_raw = phishing_model.predict(prepare_phishing_input(url))[0]
161
- malware_pred_raw = malware_model.predict(prepare_malware_input(url))[0]
162
-
163
- # Normalize predictions
164
- phishing_pred = normalize_prediction(phishing_pred_raw)
165
- malware_pred = normalize_prediction(malware_pred_raw)
166
-
167
- # Get URL features for analysis
168
  parsed = urlparse(url)
169
  hostname = parsed.hostname or ""
170
- path = parsed.path.lower()
171
 
172
- # COMPREHENSIVE THREAT DETECTION SYSTEM
173
- def comprehensive_threat_analysis(url, hostname, path):
174
- threat_score = 0
175
- threat_patterns = []
176
- threat_level = "benign"
177
-
178
- # 1. MALWARE/SUSPICIOUS CONTENT INDICATORS
179
- malware_keywords = [
180
- 'download', 'install', 'exe', 'zip', 'rar', 'crack', 'keygen', 'serial',
181
- 'patch', 'activator', 'loader', 'hack', 'cheat', 'bot', 'tool',
182
- 'generator', 'free', 'premium', 'pro', 'full', 'latest'
183
- ]
184
-
185
- malware_extensions = [
186
- '.exe', '.bat', '.cmd', '.scr', '.pif', '.jar', '.zip',
187
- '.rar', '.7z', '.apk', '.deb', '.rpm', '.dmg', '.pkg'
188
- ]
189
-
190
- # Check for malware indicators in URL
191
- for keyword in malware_keywords:
192
- if keyword in url.lower():
193
- threat_score += 3
194
- threat_patterns.append(f"Malware keyword: {keyword}")
195
-
196
- for ext in malware_extensions:
197
- if ext in url.lower():
198
- threat_score += 4
199
- threat_patterns.append(f"Executable extension: {ext}")
200
-
201
- # 2. ADULT/INAPPROPRIATE CONTENT DETECTION
202
- adult_keywords = [
203
- 'porn', 'xxx', 'sex', 'adult', 'nude', 'nsfw', 'erotic', 'cam',
204
- 'escort', 'dating', 'hookup', 'tube', 'celebflix', 'xtube'
205
- ]
206
-
207
- adult_domains = [
208
- 'pornhub', 'xvideos', 'redtube', 'youporn', 'tube8', 'xtube',
209
- 'celebflix', 'zodcaps', 'adultfriendfinder', 'ashley'
210
- ]
211
-
212
- for keyword in adult_keywords:
213
- if keyword in url.lower():
214
- threat_score += 5
215
- threat_patterns.append(f"Adult content indicator: {keyword}")
216
-
217
- for domain in adult_domains:
218
- if domain in hostname.lower():
219
- threat_score += 6
220
- threat_patterns.append(f"Adult content domain: {domain}")
221
-
222
- # 3. SUSPICIOUS DOMAIN PATTERNS
223
- suspicious_tlds = [
224
- '.tk', '.ml', '.ga', '.cf', '.pw', '.cc', '.ws', '.info',
225
- '.biz', '.click', '.download', '.stream', '.cam', '.me'
226
- ]
227
-
228
- for tld in suspicious_tlds:
229
- if hostname.endswith(tld):
230
- threat_score += 3
231
- threat_patterns.append(f"Suspicious TLD: {tld}")
232
-
233
- # 4. LONG/COMPLEX URLS (often malicious)
234
- if len(url) > 150:
235
- threat_score += 4
236
- threat_patterns.append("Extremely long URL")
237
- elif len(url) > 100:
238
- threat_score += 2
239
- threat_patterns.append("Long URL")
240
-
241
- # 5. EXCESSIVE PATH DEPTH
242
- path_segments = [seg for seg in path.split('/') if seg]
243
- if len(path_segments) > 5:
244
- threat_score += 3
245
- threat_patterns.append("Deep path structure")
246
-
247
- # 6. SUSPICIOUS CHARACTERS IN URL
248
- suspicious_chars = ['%', '&', '=', '?', '#']
249
- char_count = sum(url.count(char) for char in suspicious_chars)
250
- if char_count > 10:
251
- threat_score += 2
252
- threat_patterns.append("Many special characters")
253
-
254
- # 7. IP ADDRESS INSTEAD OF DOMAIN
255
- if re.match(r'\d+\.\d+\.\d+\.\d+', hostname):
256
- threat_score += 8
257
- threat_patterns.append("IP address used instead of domain")
258
-
259
- # 8. SUBDOMAIN ANALYSIS
260
- if hostname.count('.') > 4:
261
- threat_score += 4
262
- threat_patterns.append("Excessive subdomains")
263
-
264
- # 9. KNOWN BAD PATTERNS
265
- bad_patterns = [
266
- 'vanguard.com/totalrewards', # Your specific example
267
- 'celebflix', 'xtube', 'zodcaps', 'torrent', 'pirate'
268
- ]
269
-
270
- for pattern in bad_patterns:
271
- if pattern in url.lower():
272
- threat_score += 7
273
- threat_patterns.append(f"Known suspicious pattern: {pattern}")
274
-
275
- # 10. CRYPTOCURRENCY/FINANCIAL SCAM INDICATORS
276
- crypto_keywords = ['bitcoin', 'crypto', 'mining', 'wallet', 'investment', 'earn']
277
- for keyword in crypto_keywords:
278
- if keyword in url.lower():
279
- threat_score += 3
280
- threat_patterns.append(f"Crypto-related: {keyword}")
281
-
282
- return threat_score, threat_patterns
283
 
284
- # Get comprehensive threat analysis
285
- threat_score, threat_patterns = comprehensive_threat_analysis(url, hostname, path)
 
286
 
287
- # Check for known legitimate domains
288
- def is_verified_legitimate(hostname):
289
- verified_domains = [
290
- 'google.com', 'youtube.com', 'facebook.com', 'amazon.com', 'microsoft.com',
291
- 'apple.com', 'netflix.com', 'instagram.com', 'twitter.com', 'linkedin.com',
292
- 'github.com', 'stackoverflow.com', 'wikipedia.org', 'reddit.com', 'bbc.com',
293
- 'cnn.com', 'nytimes.com', 'forbes.com', 'techcrunch.com'
294
- ]
295
-
296
- for domain in verified_domains:
297
- if hostname == domain or hostname.endswith('.' + domain):
298
- return True
299
- return False
300
 
301
- is_legitimate = is_verified_legitimate(hostname)
 
 
 
302
 
303
- # ENHANCED DECISION LOGIC WITH THREAT SCORING
304
- if threat_score >= 15:
305
- final_result = "Malicious"
306
- reason = f"🚨 HIGH THREAT: Multiple malicious indicators (Score: {threat_score})"
307
-
308
- elif threat_score >= 10:
309
- final_result = "Suspicious"
310
- reason = f"⚠️ MEDIUM THREAT: Suspicious patterns detected (Score: {threat_score})"
311
-
312
- elif threat_score >= 6:
313
- final_result = "Suspicious"
314
- reason = f"⚠️ LOW-MEDIUM THREAT: Some concerning patterns (Score: {threat_score})"
315
-
316
- elif malware_pred == "threat" or phishing_pred == "threat":
317
- # Model detected threat even with low pattern score
318
- if malware_pred == "threat" and phishing_pred == "threat":
319
- final_result = "Malicious"
320
- reason = "🚨 Both AI models detected threats"
321
- elif malware_pred == "threat":
322
- final_result = "Malicious" if threat_score >= 3 else "Suspicious"
323
- reason = f"🦠 Malware model + patterns (Score: {threat_score})"
324
- else: # phishing_pred == "threat"
325
- final_result = "Phishing" if threat_score >= 3 else "Suspicious"
326
- reason = f"🎣 Phishing model + patterns (Score: {threat_score})"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
327
 
328
- elif is_legitimate and threat_score <= 3:
329
- final_result = "Benign"
330
- reason = "βœ… Verified legitimate domain"
331
-
332
- elif threat_score <= 2:
333
- final_result = "Benign"
334
- reason = "βœ… Low threat score, appears safe"
335
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336
  else:
337
- final_result = "Suspicious"
338
- reason = f"⚠️ Pattern analysis suggests caution (Score: {threat_score})"
 
 
339
 
340
  return {
341
- "url": url,
342
- "final_result": final_result,
343
- "decision_reason": reason,
344
- "phishing_model_prediction": str(phishing_pred_raw),
345
- "malware_model_prediction": str(malware_pred_raw),
346
- "normalized_phishing": phishing_pred,
347
- "normalized_malware": malware_pred,
348
- "threat_score": threat_score,
349
- "threat_patterns": threat_patterns[:4], # Top 4 patterns
350
- "analysis_type": "Enhanced Pattern + AI Model Analysis"
351
  }
352
 
353
  except Exception as e:
354
- return {"error": str(e)}
 
 
 
 
 
 
355
 
356
  # -------------------------------
357
- # GRADIO INTERFACE
358
  # -------------------------------
359
- # UPDATED INTERFACE FUNCTION
 
 
 
 
 
 
360
 
361
- def interface_fn(url):
362
- if not url.strip():
363
- return "❌ Please enter a valid URL"
 
 
364
 
365
- # Add protocol if missing
 
366
  if not url.startswith(('http://', 'https://')):
367
  url = 'https://' + url
368
 
369
- result = analyze_url(url)
 
 
 
 
 
 
 
 
 
 
 
 
370
 
371
- if "error" in result:
372
- return f"❌ Error analyzing URL: {result['error']}"
373
 
374
- # Format output for better readability
375
- output = f"""
376
- πŸ” Analysis Report for: {result['url']}
377
- ⚠️ Final Verdict: {result['final_result']}
378
- πŸ“Œ Decision Reason: {result['decision_reason']}
379
- πŸ”’ Phishing Model: {result['phishing_model_prediction']} (normalized: {result['normalized_phishing']})
380
- πŸ›‘οΈ Malware Model: {result['malware_model_prediction']} (normalized: {result['normalized_malware']})
381
- 🎯 Threat Score: {result['threat_score']}/30
382
- πŸ“Š Analysis Type: {result['analysis_type']}
 
 
383
  """
384
 
385
- # Add threat patterns if any
386
- if result['threat_patterns']:
387
- output += f"🚩 Detected Threats: {', '.join(result['threat_patterns'])}\n"
388
-
389
- output += "=" * 60
390
-
391
- # Add appropriate emoji and color coding based on threat score
392
- if result['final_result'] == "Benign":
393
- output = "βœ… SAFE " + output
394
- elif result['final_result'] == "Malicious":
395
- output = "🚨 DANGEROUS " + output
396
- elif result['final_result'] == "Phishing":
397
- output = "🎣 PHISHING " + output
398
- else: # Suspicious
399
- if result['threat_score'] >= 10:
400
- output = "⚠️ HIGH RISK " + output
401
- elif result['threat_score'] >= 6:
402
- output = "⚠️ MEDIUM RISK " + output
403
- else:
404
- output = "⚠️ LOW RISK " + output
405
-
406
- return output
 
 
 
 
 
 
 
 
 
 
 
407
 
408
  # -------------------------------
409
- # GRADIO APP
410
  # -------------------------------
411
- demo = gr.Interface(
412
- fn=interface_fn,
413
- inputs=gr.Text(
414
- label="Enter URL to Analyze",
415
- placeholder="https://example.com or just example.com",
416
- lines=1
417
- ),
418
- outputs=gr.Textbox(
419
- label="πŸ›‘οΈ Threat Analysis Report",
420
- lines=10,
421
- max_lines=15
422
- ),
423
- title="πŸ›‘οΈ AI-Powered URL Threat Analyzer",
424
- description="""
425
- **Advanced URL Security Scanner**
426
-
427
- This tool uses dual AI models to detect:
428
- β€’ 🎣 Phishing attacks
429
- β€’ 🦠 Malware threats
430
- β€’ πŸ”’ Overall URL safety
431
-
432
- Enter any URL to get a comprehensive security analysis.
433
- """,
434
- examples=[
435
- ["https://www.google.com"],
436
- ["https://www.paypal.com/signin"],
437
- ["https://www.bbc.com/news"],
438
- ["bit.ly/suspicious-link"],
439
- ["http://malware-site.ru/download.exe"]
440
- ],
441
- theme=gr.themes.Soft(),
442
- css="""
443
  .gradio-container {
444
- max-width: 800px;
445
  margin: auto;
 
446
  }
447
- """,
448
-
449
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
450
 
 
 
 
451
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
 
 
 
 
452
  demo.launch(
453
  share=True,
454
  server_name="0.0.0.0",
455
  server_port=7860,
456
  show_error=True,
457
- # Add these parameters to ensure API works
458
- show_api=True
459
-
460
- )
461
-
462
-
 
9
  import dns.resolver
10
  from urllib.parse import urlparse
11
  from datetime import datetime
12
+ import requests
13
+ from collections import Counter
14
 
15
  # -------------------------------
16
  # Load Trained Models
17
  # -------------------------------
18
# Load both stacked classifiers at import time. If either file is missing the
# app falls back to rule-based analysis only (both names are set to None).
# NOTE(security): joblib.load unpickles arbitrary Python objects; only ship
# model files that come from a trusted source.
try:
    phishing_model = joblib.load("phishing_stack.pkl")
    malware_model = joblib.load("new_malware_stack.pkl")
except FileNotFoundError as e:
    print(f"Model file not found: {e}")
    phishing_model, malware_model = None, None
24
 
25
  # -------------------------------
26
+ # TRUSTED DOMAINS DATABASE
27
  # -------------------------------
28
# Registrable domains that are trusted outright (the host itself or any of
# its subdomains). Entries must be full domains: a bare TLD such as 'org'
# would make every host under that TLD trusted.
TRUSTED_DOMAINS = {
    # Major Tech Companies
    'google.com', 'youtube.com', 'gmail.com', 'gstatic.com', 'googleapis.com',
    'facebook.com', 'instagram.com', 'whatsapp.com', 'messenger.com',
    'microsoft.com', 'outlook.com', 'office.com', 'xbox.com', 'bing.com',
    # NOTE(review): confirm 'app-store.com' is actually Apple-owned.
    'apple.com', 'icloud.com', 'itunes.com', 'app-store.com',
    'amazon.com', 'aws.amazon.com', 'amazonwebservices.com',

    # Social Media & Communication
    'twitter.com', 'x.com', 'linkedin.com', 'reddit.com', 'pinterest.com',
    'snapchat.com', 'tiktok.com', 'discord.com', 'telegram.org',
    'zoom.us', 'skype.com', 'teams.microsoft.com',

    # News & Media
    'bbc.com', 'cnn.com', 'nytimes.com', 'reuters.com', 'bloomberg.com',
    'forbes.com', 'techcrunch.com', 'theverge.com', 'arstechnica.com',

    # Banking & Finance
    # NOTE(review): 'wells.com' is presumably meant to be Wells Fargo
    # (wellsfargo.com) - verify ownership before trusting it.
    'paypal.com', 'stripe.com', 'visa.com', 'mastercard.com',
    'chase.com', 'bankofamerica.com', 'wells.com', 'citibank.com',

    # E-commerce
    'ebay.com', 'etsy.com', 'shopify.com', 'walmart.com', 'target.com',

    # Development & Tech
    'github.com', 'stackoverflow.com', 'mozilla.org', 'w3.org',
    'cloudflare.com', 'docker.com', 'npmjs.com',

    # Educational
    'wikipedia.org', 'wikimedia.org', 'mit.edu', 'stanford.edu',
    'coursera.org', 'udemy.com', 'khanacademy.org',
}

# Restricted-registration TLDs that are trusted as a whole.
_TRUSTED_TLD_SUFFIXES = ('.gov', '.edu', '.mil')

# Well-known .org projects, matched as exact domains (never as substrings).
_TRUSTED_ORG_DOMAINS = ('wikipedia.org', 'mozilla.org', 'apache.org', 'gnu.org')

# Suspicious TLDs that are often misused
SUSPICIOUS_TLDS = {
    '.tk', '.ml', '.ga', '.cf', '.pw', '.cc', '.ws', '.info',
    '.biz', '.click', '.download', '.stream', '.cam', '.me',
    '.top', '.work', '.date', '.review', '.country', '.racing',
}

# Known malicious patterns (matched as substrings of the lower-cased URL)
MALICIOUS_PATTERNS = {
    'phishing_keywords': [
        'verify-account', 'account-suspended', 'urgent-action',
        'click-here-now', 'limited-time', 'act-now', 'winner',
        'congratulations', 'claim-prize', 'free-money', 'inheritance',
    ],
    'malware_keywords': [
        'crack', 'keygen', 'serial', 'patch', 'activator', 'loader',
        'hack-tool', 'cheat-engine', 'bot-download', 'virus-scan',
        'antivirus-update', 'flash-player-update',
    ],
    'adult_keywords': [
        'xxx', 'porn', 'adult', 'sex', 'nude', 'erotic', 'cam',
        'escort', 'dating-hookup', 'live-cam', 'webcam',
    ],
}

# URL shortening services, matched against the host (exact or subdomain).
_URL_SHORTENERS = (
    'bit.ly', 'tinyurl.com', 'goo.gl', 't.co', 'ow.ly', 'is.gd', 'short.link',
)

# File extensions that commonly carry executable or archived payloads.
_SUSPICIOUS_EXTENSIONS = ('.exe', '.bat', '.scr', '.zip', '.rar', '.apk')

# Dotted-quad IPv4 literal used in place of a host name.
_IPV4_HOST_RE = re.compile(r'^(\d{1,3}\.){3}\d{1,3}$')


def _host_matches(hostname, domain):
    """Return True when *hostname* is *domain* or one of its subdomains."""
    return hostname == domain or hostname.endswith('.' + domain)


# -------------------------------
# ENHANCED WHITELISTING SYSTEM
# -------------------------------
def is_trusted_domain(hostname):
    """Return True if *hostname* belongs to a trusted domain.

    A host is trusted when it ends in one of the restricted TLDs
    (.gov/.edu/.mil), or when it equals, or is a subdomain of, an entry in
    TRUSTED_DOMAINS or the small list of well-known .org projects.
    Matching is on whole labels, so look-alikes such as
    'wikipedia-login.org' or 'notgoogle.com' are not trusted. Empty or
    missing hostnames are never trusted.
    """
    if not hostname:
        return False

    # A trailing dot is the fully-qualified spelling of the same host.
    hostname = hostname.lower().strip().rstrip('.')

    if hostname.endswith(_TRUSTED_TLD_SUFFIXES):
        return True

    # Dot-less entries (bare TLDs) are skipped defensively: treating them as
    # a suffix would whitelist an entire TLD.
    return any(
        _host_matches(hostname, domain)
        for domain in (*TRUSTED_DOMAINS, *_TRUSTED_ORG_DOMAINS)
        if '.' in domain
    )


# -------------------------------
# INTELLIGENT FEATURE EXTRACTION
# -------------------------------
def extract_enhanced_features(url):
    """Extract the rule/ML feature dictionary for *url*.

    Host-level counts (dots, hyphens, underscores, digits) are taken from the
    hostname only; keyword scores count how many entries of each
    MALICIOUS_PATTERNS list occur in the lower-cased URL. Boolean features
    are returned as 0/1 ints and 'tld' as the last hostname label (or '').
    """
    parsed = urlparse(url)
    hostname = parsed.hostname or ""
    path = parsed.path or ""
    query = parsed.query or ""
    url_lower = url.lower()

    # Last label of the host; empty for single-label hosts.
    tld = hostname.split('.')[-1] if '.' in hostname else ""

    def keyword_hits(category):
        return sum(1 for keyword in MALICIOUS_PATTERNS[category]
                   if keyword in url_lower)

    # Shorteners are matched on the host itself: a substring test would flag
    # e.g. 'microsoft.com' because it contains 't.co'.
    is_shortened = any(_host_matches(hostname, short) for short in _URL_SHORTENERS)

    path_lower = path.lower()
    has_suspicious_extension = any(ext in path_lower for ext in _SUSPICIOUS_EXTENSIONS)

    return {
        'url_length': len(url),
        'hostname_length': len(hostname),
        'path_length': len(path),
        'num_dots': hostname.count('.'),
        'num_hyphens': hostname.count('-'),
        'num_underscores': hostname.count('_'),
        'num_digits': sum(c.isdigit() for c in hostname),
        'has_ip': int(bool(_IPV4_HOST_RE.match(hostname))),
        'is_https': int(parsed.scheme == 'https'),
        'is_shortened': int(is_shortened),
        'has_suspicious_tld': int('.' + tld in SUSPICIOUS_TLDS),
        'phishing_score': keyword_hits('phishing_keywords'),
        'malware_score': keyword_hits('malware_keywords'),
        'adult_score': keyword_hits('adult_keywords'),
        'has_suspicious_extension': int(has_suspicious_extension),
        'num_params': len(query.split('&')) if query else 0,
        'has_encoded_chars': int('%' in url),
        'is_trusted': int(is_trusted_domain(hostname)),
        'tld': tld,
    }
190
 
191
  # -------------------------------
192
+ # SMART PREDICTION SYSTEM
193
  # -------------------------------
194
def _score_rule_indicators(features):
    """Return (threat_score, findings) for the rule-based pre-screening."""
    score = 0
    findings = []

    def flag(points, message):
        nonlocal score
        score += points
        findings.append(message)

    # IP address instead of domain
    if features['has_ip']:
        flag(25, 'Using IP address instead of domain name')

    # Suspicious TLD
    if features['has_suspicious_tld']:
        flag(15, f'Suspicious top-level domain (.{features["tld"]})')

    # Malicious keywords
    if features['phishing_score'] > 2:
        flag(20, 'Multiple phishing-related keywords detected')
    elif features['phishing_score'] > 0:
        flag(10, 'Phishing-related keywords found')

    if features['malware_score'] > 1:
        flag(25, 'Malware-related keywords detected')

    if features['adult_score'] > 1:
        flag(15, 'Adult content indicators')

    # Suspicious file extensions
    if features['has_suspicious_extension']:
        flag(20, 'Suspicious file extension detected')

    # URL characteristics
    if features['url_length'] > 200:
        flag(10, 'Extremely long URL')
    elif features['url_length'] > 100:
        flag(5, 'Long URL')

    # Too many subdomains
    if features['num_dots'] > 5:
        flag(15, 'Excessive subdomains')

    return score, findings


def _verdict_for_score(threat_score):
    """Map a threat score to (verdict, confidence, threat_level, reason)."""
    if threat_score >= 50:
        return ('Malicious', min(95, 60 + threat_score), 'High',
                'Multiple high-risk indicators detected')
    if threat_score >= 30:
        return ('Suspicious', min(85, 50 + threat_score), 'Medium',
                'Several concerning patterns identified')
    if threat_score >= 15:
        return ('Potentially Risky', min(75, 40 + threat_score), 'Low-Medium',
                'Some suspicious indicators present')
    if threat_score >= 5:
        return ('Caution Advised', 60, 'Low',
                'Minor risk indicators detected')
    return ('Likely Safe', max(70, 90 - threat_score), 'Minimal',
            'No significant threats detected')


def smart_url_analysis(url):
    """Analyze *url* in layers and return a verdict dictionary.

    Layers: (1) trusted-domain short-circuit, (2) feature extraction,
    (3) rule-based scoring, (4) optional ML model votes, (5) verdict mapping.

    Returns a dict with 'final_verdict', 'confidence', 'reason',
    'threat_level' and 'details' (always a list of strings). Non-trusted
    results also carry 'threat_score' (reported on a 0-100 scale) and
    'ml_predictions', which is None unless both models actually produced a
    prediction. Any exception is converted into an 'Analysis Error' result.
    """
    try:
        hostname = urlparse(url).hostname or ""

        # Layer 1: Trusted Domain Check (Highest Priority)
        if is_trusted_domain(hostname):
            return {
                'final_verdict': 'Safe',
                'confidence': 95,
                'reason': 'Verified trusted domain',
                'threat_level': 'None',
                # A list, like every other path: the UI enumerates 'details'.
                'details': [f'✅ {hostname} is a verified legitimate domain'],
            }

        # Layer 2: Extract features
        features = extract_enhanced_features(url)

        # Layer 3: Rule-based pre-screening
        threat_score, immediate_threats = _score_rule_indicators(features)

        # Layer 4: ML Model predictions (if available)
        ml_predictions = None
        if phishing_model is not None and malware_model is not None:
            try:
                # NOTE(review): the feature dict is passed to predict() as-is,
                # including the string 'tld' column; confirm it matches the
                # columns the models were trained on.
                feature_df = pd.DataFrame([features])
                phishing_pred = phishing_model.predict(feature_df)[0]
                malware_pred = malware_model.predict(feature_df)[0]
            except Exception as e:
                # Best effort: fall back to the rule-based score, and report
                # no ML result rather than a misleading "clear".
                print(f"Model prediction error: {e}")
            else:
                # Interpret predictions (adjust based on your model's output format)
                ml_phishing_threat = str(phishing_pred).lower() in ['1', 'phishing', 'malicious']
                ml_malware_threat = str(malware_pred).lower() in ['1', 'malware', 'malicious']
                ml_predictions = {
                    'phishing': ml_phishing_threat,
                    'malware': ml_malware_threat,
                }

                # Only add ML threat score if rule-based score is already high
                if ml_phishing_threat and threat_score > 10:
                    threat_score += 15
                    immediate_threats.append('ML model detected phishing patterns')

                if ml_malware_threat and threat_score > 10:
                    threat_score += 15
                    immediate_threats.append('ML model detected malware patterns')

        # Layer 5: Final decision making
        verdict, confidence, threat_level, reason = _verdict_for_score(threat_score)

        return {
            'final_verdict': verdict,
            'confidence': confidence,
            'reason': reason,
            'threat_level': threat_level,
            # The summed indicators can exceed 100; report on the 0-100 scale
            # that the UI advertises.
            'threat_score': min(threat_score, 100),
            'details': immediate_threats[:5],  # Top 5 threats
            'ml_predictions': ml_predictions,
        }

    except Exception as e:
        return {
            'final_verdict': 'Analysis Error',
            'confidence': 0,
            'reason': f'Error during analysis: {str(e)}',
            'threat_level': 'Unknown',
            'details': [],
        }
339
 
340
  # -------------------------------
341
+ # ENHANCED UI INTERFACE
342
  # -------------------------------
343
def analyze_url_interface(url):
    """Format the analysis of *url* as a text report for the Gradio UI.

    Blank or missing input returns a usage message. Otherwise the URL is
    stripped, given an 'https://' scheme when it has none, analysed with
    smart_url_analysis(), and rendered as a multi-line report string.
    """
    if not url or not url.strip():
        return (
            "\n"
            "🚫 **Error: No URL Provided**\n"
            "Please enter a valid URL to analyze.\n"
            "\n"
            "**Example formats:**\n"
            "• https://example.com\n"
            "• http://suspicious-site.com\n"
            "• just-domain.com (we'll add https://)\n"
        )

    # Clean and prepare URL
    url = url.strip()
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url

    # Perform analysis
    result = smart_url_analysis(url)

    # Create emoji indicators
    verdict_emoji = {
        'Safe': '✅',
        'Likely Safe': '✅',
        'Caution Advised': '⚠️',
        'Potentially Risky': '⚠️',
        'Suspicious': '🔴',
        'Malicious': '🚨',
        'Analysis Error': '❌',
    }

    filled = result['confidence'] // 10
    confidence_bar = "█" * filled + "░" * (10 - filled)

    # Format main report
    report = (
        "\n"
        f"{verdict_emoji.get(result['final_verdict'], '❓')} **SECURITY ANALYSIS REPORT**\n"
        "═══════════════════════════════════════════════\n"
        "\n"
        f"🌐 **URL:** {url}\n"
        "\n"
        f"🛡️ **Security Verdict:** {result['final_verdict']}\n"
        f"📊 **Confidence Level:** {result['confidence']}% {confidence_bar}\n"
        f"⚡ **Threat Level:** {result['threat_level']}\n"
        f"💡 **Primary Reason:** {result['reason']}\n"
    )

    # Add threat score if applicable
    if 'threat_score' in result:
        score = result['threat_score']
        threat_bar = "🔥" * min(10, score // 5) + "░" * max(0, 10 - score // 5)
        # The label promises a 0-100 scale, so never print more than 100.
        report += f"🎯 **Threat Score:** {min(score, 100)}/100 {threat_bar}\n"

    # Add detailed findings. A bare string is wrapped so it is listed as one
    # finding instead of being enumerated character by character.
    details = result.get('details') or []
    if isinstance(details, str):
        details = [details]
    if details:
        report += "\n🔍 **Detailed Findings:**\n"
        for i, detail in enumerate(details, 1):
            report += f" {i}. {detail}\n"

    # Add ML predictions if available
    ml_predictions = result.get('ml_predictions')
    if ml_predictions:
        report += "\n🤖 **AI Model Analysis:**\n"
        report += f" • Phishing Detection: {'⚠️ Detected' if ml_predictions['phishing'] else '✅ Clear'}\n"
        report += f" • Malware Detection: {'⚠️ Detected' if ml_predictions['malware'] else '✅ Clear'}\n"

    # Add recommendations
    report += "\n💡 **Recommendations:**\n"

    if result['final_verdict'] in ['Safe', 'Likely Safe']:
        report += " ✅ This URL appears safe to visit\n ✅ Standard security practices still recommended\n"
    elif result['final_verdict'] in ['Caution Advised', 'Potentially Risky']:
        report += " ⚠️ Exercise caution when visiting\n ⚠️ Verify the site's legitimacy before entering personal data\n"
    elif result['final_verdict'] in ['Suspicious', 'Malicious']:
        report += " 🚨 **DO NOT VISIT** this URL\n 🚨 Consider it a security threat\n 🚨 Report if received via email/message\n"
    else:
        report += " ❓ Unable to determine safety - proceed with extreme caution\n"

    report += "\n" + "═" * 50
    report += f"\n⏰ Analysis completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"

    return report
424
 
425
  # -------------------------------
426
+ # GRADIO APPLICATION
427
  # -------------------------------
428
def create_interface():
    """Build and return the Gradio Interface for the URL analyzer.

    The interface wires a single-line URL textbox to
    analyze_url_interface() and shows the returned report in a read-only
    textbox. Nothing is launched here; the caller invokes .launch().
    """
    # Custom CSS for better appearance
    custom_css = """
    .gradio-container {
        max-width: 900px !important;
        margin: auto;
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    }

    .input-container {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        padding: 20px;
        border-radius: 15px;
        margin-bottom: 20px;
    }

    .output-container {
        background: #f8f9fa;
        border: 2px solid #e9ecef;
        border-radius: 10px;
        padding: 15px;
    }

    .title {
        text-align: center;
        color: #2c3e50;
        font-weight: bold;
        margin-bottom: 10px;
    }

    .description {
        text-align: center;
        color: #34495e;
        font-size: 16px;
        margin-bottom: 20px;
    }
    """

    # Built line by line so the Markdown is not indented (indented Markdown
    # would render as a code block).
    description = "\n".join([
        "",
        "**Professional-Grade URL Threat Detection System**",
        "",
        "🎯 **Features:**",
        "• Dual AI model analysis for phishing and malware detection",
        "• Real-time threat pattern recognition",
        "• Trusted domain verification system",
        "• Comprehensive risk scoring algorithm",
        "",
        "🔒 **Protection Against:**",
        "• Phishing websites • Malware distribution sites • Suspicious short links • Adult content • Financial scams",
        "",
        "Simply paste any URL below to get instant security analysis!",
        "",
    ])

    # Example URLs for testing
    example_urls = [
        ["https://www.google.com"],
        ["https://github.com"],
        ["https://www.paypal.com"],
        ["http://suspicious-site.tk"],
        ["bit.ly/malicious-link"],
        ["http://192.168.1.1/download.exe"],
    ]

    # Create interface
    # NOTE(review): `allow_flagging` and `show_copy_button` are
    # version-dependent Gradio arguments - confirm against the pinned version.
    demo = gr.Interface(
        fn=analyze_url_interface,
        inputs=gr.Textbox(
            label="🔍 Enter URL to Analyze",
            placeholder="Enter URL here (e.g., https://example.com or just example.com)",
            lines=1,
            max_lines=1,
            elem_classes="input-container",
        ),
        outputs=gr.Textbox(
            label="🛡️ Security Analysis Report",
            lines=20,
            max_lines=25,
            elem_classes="output-container",
            show_copy_button=True,
        ),
        title="🛡️ Advanced URL Security Analyzer",
        description=description,
        examples=example_urls,
        theme=gr.themes.Soft(
            primary_hue="blue",
            secondary_hue="gray",
            neutral_hue="slate",
        ),
        css=custom_css,
        analytics_enabled=False,
        allow_flagging="never",
    )

    return demo
522
 
523
+ # -------------------------------
524
+ # MAIN APPLICATION
525
+ # -------------------------------
526
if __name__ == "__main__":
    print("🚀 Starting Enhanced URL Security Analyzer...")
    print("🔧 Loading models and initializing system...")

    # Verify models are loaded. Compare against None explicitly: an estimator
    # object that defines __len__ could otherwise evaluate as falsy.
    if phishing_model is None or malware_model is None:
        print("⚠️ Warning: ML models not found. Running with rule-based analysis only.")
    else:
        print("✅ ML models loaded successfully!")

    # Create and launch interface
    demo = create_interface()

    print("🌐 Launching web interface...")
    # NOTE(review): share=True together with server_name="0.0.0.0" exposes the
    # app publicly - confirm this is intended for the deployment target.
    demo.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        show_api=True,
        quiet=False,
    )