quantum-drive committed on
Commit
dd2c09d
·
verified ·
1 Parent(s): c2e1640

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +159 -97
app.py CHANGED
@@ -161,60 +161,136 @@ def prepare_malware_input(url):
161
  return df
162
 
163
  # -------------------------------
164
- # RISK SCORING SYSTEM
165
  # -------------------------------
166
  def calculate_phishing_risk(features):
167
- """Calculate phishing risk score based on key indicators"""
168
  risk_score = 0
169
 
170
- # High-risk indicators
171
  if features['has_ip_address']:
172
- risk_score += 30
173
  if features['is_shortened']:
174
- risk_score += 25
175
- if features['suspicious_tld']:
176
- risk_score += 20
177
  if features['has_suspicious_words']:
178
- risk_score += 15
179
 
180
- # Medium-risk indicators
181
- risk_score += min(features['num_special_chars'] * 3, 15)
182
- risk_score += min(features['num_hyphens'] * 2, 10)
183
- risk_score += min(features['path_keyword_count'] * 10, 20)
 
184
 
185
- # Length-based risk
 
 
186
  if features['url_length'] > 75:
187
  risk_score += 10
188
-
 
 
189
  return min(risk_score, 100)
190
 
191
  def calculate_malware_risk(features):
192
- """Calculate malware risk score based on key indicators"""
193
  risk_score = 0
194
 
195
- # High-risk indicators
196
  if features['has_ip_address']:
197
- risk_score += 30
198
- if features['is_shortened']:
199
- risk_score += 25
200
  if features['has_suspicious_keyword']:
201
- risk_score += 20
202
- if features['path_keyword_count'] > 0:
203
- risk_score += 15
204
- if features['domain_age_days'] < 30 or features['domain_age_days'] > 365*10:
205
- risk_score += 20
 
 
 
 
 
 
 
 
 
206
 
207
- # Medium-risk indicators
208
- if features['dns_ttl'] < 300: # Low TTL often indicates malicious domains
209
- risk_score += 15
210
  if not features['ssl_valid'] and features['is_https']:
 
 
 
 
 
211
  risk_score += 10
212
 
213
  return min(risk_score, 100)
214
 
215
  # -------------------------------
216
- # ADVANCED TRUTH TABLE DECISION LOGIC
217
  # -------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  def analyze_url(url):
219
  try:
220
  # Extract features
@@ -229,51 +305,20 @@ def analyze_url(url):
229
  phishing_pred = phishing_model.predict(phishing_df)[0]
230
  malware_pred = malware_model.predict(malware_df)[0]
231
 
232
- # Calculate risk scores
233
  phishing_risk = calculate_phishing_risk(phishing_features)
234
  malware_risk = calculate_malware_risk(malware_features)
235
 
236
- # BALANCED TRUTH TABLE DECISION MATRIX
237
- # Case 1: Both models detect threats
238
- if phishing_pred == "Phishing" and malware_pred == "malicious":
239
- # Give priority based on risk scores and indicators
240
- if malware_risk > phishing_risk and malware_risk >= 60:
241
- final = "Malicious"
242
- reason = "Both models detected threat - malware risk indicators stronger"
243
- else:
244
- final = "Phishing"
245
- reason = "Both models detected threat - phishing indicators stronger"
246
-
247
- # Case 2: Only malware model detects threat
248
- elif malware_pred == "malicious" and phishing_pred != "Phishing":
249
- final = "Malicious"
250
- reason = "Malware model detected malicious threat"
251
-
252
- # Case 3: Only phishing model detects threat
253
- elif phishing_pred == "Phishing" and malware_pred != "malicious":
254
- final = "Phishing"
255
- reason = "Phishing model detected threat"
256
-
257
- # Case 4: Both models report benign - use risk-based detection
258
- else:
259
- if malware_risk >= 70:
260
- final = "Malicious"
261
- reason = "Models reported benign but high malware risk indicators detected"
262
- elif phishing_risk >= 70:
263
- final = "Phishing"
264
- reason = "Models reported benign but high phishing risk indicators detected"
265
- elif malware_risk >= 50 or phishing_risk >= 50:
266
- final = "Suspicious"
267
- reason = "Models reported benign but moderate risk indicators present"
268
- else:
269
- final = "Benign"
270
- reason = "No threats detected by models or risk indicators"
271
 
272
  # Prepare detailed report
273
  report = {
274
  "url": url,
275
- "final_result": final,
276
- "decision_reason": reason,
277
  "phishing": {
278
  "prediction": phishing_pred,
279
  "risk_score": phishing_risk,
@@ -282,7 +327,8 @@ def analyze_url(url):
282
  "is_shortened": bool(phishing_features['is_shortened']),
283
  "suspicious_tld": bool(phishing_features['suspicious_tld']),
284
  "suspicious_words": bool(phishing_features['has_suspicious_words']),
285
- "path_keywords": phishing_features['path_keyword_count']
 
286
  }
287
  },
288
  "malware": {
@@ -292,9 +338,15 @@ def analyze_url(url):
292
  "has_ip": bool(malware_features['has_ip_address']),
293
  "is_shortened": bool(malware_features['is_shortened']),
294
  "suspicious_keywords": bool(malware_features['has_suspicious_keyword']),
295
- "new_domain": malware_features['domain_age_days'] < 365,
296
- "low_ttl": malware_features['dns_ttl'] < 300 and malware_features['dns_ttl'] > 0
 
297
  }
 
 
 
 
 
298
  }
299
  }
300
 
@@ -311,31 +363,39 @@ def interface_fn(url):
311
  if "error" in result:
312
  return f"❌ Error: {result['error']}"
313
 
314
- # Format output
315
  output = f"""
316
- 🔍 URL Analysis Report: {result['url']}
317
- 🎯 Final Verdict: {result['final_result']}
318
- 📌 Reason: {result['decision_reason']}
319
-
320
- 🔒 Phishing Analysis:
321
- - Prediction: {result['phishing']['prediction']}
322
- - Risk Score: {result['phishing']['risk_score']}/100
323
- - Key Indicators:
324
- IP Address: {result['phishing']['key_indicators']['has_ip']}
325
- Shortened URL: {result['phishing']['key_indicators']['is_shortened']}
326
- Suspicious TLD: {result['phishing']['key_indicators']['suspicious_tld']}
327
- • Suspicious Words: {result['phishing']['key_indicators']['suspicious_words']}
328
- Path Keywords: {result['phishing']['key_indicators']['path_keywords']}
329
-
330
- 🛡️ Malware Analysis:
331
- - Prediction: {result['malware']['prediction']}
332
- - Risk Score: {result['malware']['risk_score']}/100
333
- - Key Indicators:
334
- IP Address: {result['malware']['key_indicators']['has_ip']}
335
- Shortened URL: {result['malware']['key_indicators']['is_shortened']}
336
- Suspicious Keywords: {result['malware']['key_indicators']['suspicious_keywords']}
337
- New Domain (<1yr): {result['malware']['key_indicators']['new_domain']}
338
- Low DNS TTL: {result['malware']['key_indicators']['low_ttl']}
 
 
 
 
 
 
 
 
339
  """
340
 
341
  return output
@@ -343,15 +403,17 @@ def interface_fn(url):
343
  demo = gr.Interface(
344
  fn=interface_fn,
345
  inputs=gr.Textbox(label="Enter URL", placeholder="https://example.com", lines=1),
346
- outputs=gr.Textbox(label="Threat Analysis Report", lines=20),
347
- title="🛡️ Advanced URL Threat Analyzer",
348
- description="Multi-layered detection system combining AI models with risk indicators",
349
  examples=[
350
  ["https://www.paypal-login-secure.com/verify"],
351
  ["https://free-movie-downloads.xyz/get.exe"],
352
  ["https://www.microsoft.com/en-us/"],
353
  ["http://192.168.1.100/install-update"],
354
- ["https://secure-apple-id-confirm.com"]
 
 
355
  ],
356
  theme="soft"
357
  )
 
161
  return df
162
 
163
  # -------------------------------
164
+ # ENHANCED RISK SCORING SYSTEM
165
  # -------------------------------
166
  def calculate_phishing_risk(features):
167
+ """Calculate enhanced phishing risk score"""
168
  risk_score = 0
169
 
170
+ # Critical indicators (High Weight)
171
  if features['has_ip_address']:
172
+ risk_score += 35 # IP addresses are major red flag for phishing
173
  if features['is_shortened']:
174
+ risk_score += 30 # URL shorteners commonly used in phishing
 
 
175
  if features['has_suspicious_words']:
176
+ risk_score += 25 # Banking/login terms are key phishing indicators
177
 
178
+ # Important indicators (Medium Weight)
179
+ if features['suspicious_tld']:
180
+ risk_score += 20 # Suspicious TLDs often used for phishing
181
+ risk_score += min(features['path_keyword_count'] * 15, 30) # Keywords in path
182
+ risk_score += min(features['query_keyword_count'] * 10, 20) # Keywords in query
183
 
184
+ # Supporting indicators (Low Weight)
185
+ risk_score += min(features['num_special_chars'] * 2, 15)
186
+ risk_score += min(features['num_hyphens'] * 3, 15)
187
  if features['url_length'] > 75:
188
  risk_score += 10
189
+ if not features['has_https']:
190
+ risk_score += 5 # No HTTPS is suspicious for login pages
191
+
192
  return min(risk_score, 100)
193
 
194
  def calculate_malware_risk(features):
195
+ """Calculate enhanced malware risk score"""
196
  risk_score = 0
197
 
198
+ # Critical indicators (High Weight)
199
  if features['has_ip_address']:
200
+ risk_score += 35 # Direct IP access common in malware
 
 
201
  if features['has_suspicious_keyword']:
202
+ risk_score += 30 # Download/crack keywords are major indicators
203
+ if features['is_shortened']:
204
+ risk_score += 25 # URL shorteners hide malicious destinations
205
+
206
+ # Important indicators (Medium Weight)
207
+ risk_score += min(features['path_keyword_count'] * 20, 40) # Malware keywords in path
208
+
209
+ # Domain age indicators
210
+ if 0 <= features['domain_age_days'] < 30:
211
+ risk_score += 25 # Very new domains are highly suspicious
212
+ elif 30 <= features['domain_age_days'] < 90:
213
+ risk_score += 15 # New domains are suspicious
214
+ elif features['domain_age_days'] > 365*15: # Very old domains can be compromised
215
+ risk_score += 10
216
 
217
+ # Network indicators
218
+ if 0 < features['dns_ttl'] < 300:
219
+ risk_score += 20 # Low TTL indicates fast-flux hosting
220
  if not features['ssl_valid'] and features['is_https']:
221
+ risk_score += 15 # Invalid SSL certificate
222
+
223
+ # Supporting indicators
224
+ risk_score += min(features['num_special_chars'] * 2, 10)
225
+ if features['url_length'] > 100:
226
  risk_score += 10
227
 
228
  return min(risk_score, 100)
229
 
230
  # -------------------------------
231
+ # LOGICAL TRUTH TABLE DECISION SYSTEM
232
  # -------------------------------
233
+ def get_final_prediction(phishing_pred, malware_pred, phishing_risk, malware_risk):
234
+ """
235
+ Enhanced logical truth table for accurate classification
236
+
237
+ Truth Table Logic:
238
+ - Model predictions have primary weight
239
+ - Risk scores provide secondary validation and override capability
240
+ - Clear thresholds prevent misclassification
241
+ """
242
+
243
+ # Define risk thresholds
244
+ HIGH_RISK = 70
245
+ MEDIUM_RISK = 45
246
+ LOW_RISK = 25
247
+
248
+ # CASE 1: Both models detect threats
249
+ if phishing_pred == "Phishing" and malware_pred == "malicious":
250
+ # Use risk scores to determine primary threat type
251
+ if malware_risk >= HIGH_RISK and malware_risk > phishing_risk + 15:
252
+ return "Malicious", "Both models detected threat - malware characteristics dominant"
253
+ elif phishing_risk >= HIGH_RISK and phishing_risk > malware_risk + 15:
254
+ return "Phishing", "Both models detected threat - phishing characteristics dominant"
255
+ else:
256
+ # When both risks are similar, use model confidence (default to phishing for similar scores)
257
+ return "Phishing", "Both models detected threat - mixed characteristics favor phishing"
258
+
259
+ # CASE 2: Only malware model detects threat
260
+ elif malware_pred == "malicious" and phishing_pred != "Phishing":
261
+ if malware_risk >= MEDIUM_RISK:
262
+ return "Malicious", "Malware model detection confirmed by risk indicators"
263
+ elif phishing_risk >= HIGH_RISK:
264
+ return "Phishing", "Malware detected but phishing risk indicators dominant"
265
+ else:
266
+ return "Malicious", "Malware model detection (low confidence)"
267
+
268
+ # CASE 3: Only phishing model detects threat
269
+ elif phishing_pred == "Phishing" and malware_pred != "malicious":
270
+ if phishing_risk >= MEDIUM_RISK:
271
+ return "Phishing", "Phishing model detection confirmed by risk indicators"
272
+ elif malware_risk >= HIGH_RISK:
273
+ return "Malicious", "Phishing detected but malware risk indicators dominant"
274
+ else:
275
+ return "Phishing", "Phishing model detection (low confidence)"
276
+
277
+ # CASE 4: Both models report benign - Risk-based override
278
+ else:
279
+ if malware_risk >= HIGH_RISK and phishing_risk >= HIGH_RISK:
280
+ # Both risks high - choose based on which is higher
281
+ if malware_risk > phishing_risk:
282
+ return "Malicious", "Models missed threat - high malware risk detected"
283
+ else:
284
+ return "Phishing", "Models missed threat - high phishing risk detected"
285
+ elif malware_risk >= HIGH_RISK:
286
+ return "Malicious", "Models reported benign but high malware risk indicators"
287
+ elif phishing_risk >= HIGH_RISK:
288
+ return "Phishing", "Models reported benign but high phishing risk indicators"
289
+ elif malware_risk >= MEDIUM_RISK or phishing_risk >= MEDIUM_RISK:
290
+ return "Suspicious", "Models reported benign but moderate risk indicators present"
291
+ else:
292
+ return "Benign", "No threats detected by models or risk analysis"
293
+
294
  def analyze_url(url):
295
  try:
296
  # Extract features
 
305
  phishing_pred = phishing_model.predict(phishing_df)[0]
306
  malware_pred = malware_model.predict(malware_df)[0]
307
 
308
+ # Calculate enhanced risk scores
309
  phishing_risk = calculate_phishing_risk(phishing_features)
310
  malware_risk = calculate_malware_risk(malware_features)
311
 
312
+ # Get final prediction using logical truth table
313
+ final_result, decision_reason = get_final_prediction(
314
+ phishing_pred, malware_pred, phishing_risk, malware_risk
315
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
 
317
  # Prepare detailed report
318
  report = {
319
  "url": url,
320
+ "final_result": final_result,
321
+ "decision_reason": decision_reason,
322
  "phishing": {
323
  "prediction": phishing_pred,
324
  "risk_score": phishing_risk,
 
327
  "is_shortened": bool(phishing_features['is_shortened']),
328
  "suspicious_tld": bool(phishing_features['suspicious_tld']),
329
  "suspicious_words": bool(phishing_features['has_suspicious_words']),
330
+ "path_keywords": phishing_features['path_keyword_count'],
331
+ "no_https": not bool(phishing_features['has_https'])
332
  }
333
  },
334
  "malware": {
 
338
  "has_ip": bool(malware_features['has_ip_address']),
339
  "is_shortened": bool(malware_features['is_shortened']),
340
  "suspicious_keywords": bool(malware_features['has_suspicious_keyword']),
341
+ "new_domain": 0 <= malware_features['domain_age_days'] < 30,
342
+ "low_ttl": 0 < malware_features['dns_ttl'] < 300,
343
+ "invalid_ssl": not bool(malware_features['ssl_valid']) and bool(malware_features['is_https'])
344
  }
345
+ },
346
+ "risk_analysis": {
347
+ "phishing_risk_level": "High" if phishing_risk >= 70 else "Medium" if phishing_risk >= 45 else "Low",
348
+ "malware_risk_level": "High" if malware_risk >= 70 else "Medium" if malware_risk >= 45 else "Low",
349
+ "confidence": "High" if abs(phishing_risk - malware_risk) > 25 else "Medium"
350
  }
351
  }
352
 
 
363
  if "error" in result:
364
  return f"❌ Error: {result['error']}"
365
 
366
+ # Format output with enhanced information
367
  output = f"""
368
+ 🔍 URL Analysis Report: {result['url']}
369
+ 🎯 Final Verdict: {result['final_result']}
370
+ 📌 Decision Logic: {result['decision_reason']}
371
+ 🔮 Analysis Confidence: {result['risk_analysis']['confidence']}
372
+
373
+ 🔒 Phishing Analysis:
374
+ - Model Prediction: {result['phishing']['prediction']}
375
+ - Risk Score: {result['phishing']['risk_score']}/100 ({result['risk_analysis']['phishing_risk_level']} Risk)
376
+ - Key Indicators:
377
+ IP Address: {result['phishing']['key_indicators']['has_ip']}
378
+ Shortened URL: {result['phishing']['key_indicators']['is_shortened']}
379
+ • Suspicious TLD: {result['phishing']['key_indicators']['suspicious_tld']}
380
+ Suspicious Words: {result['phishing']['key_indicators']['suspicious_words']}
381
+ • Path Keywords: {result['phishing']['key_indicators']['path_keywords']}
382
+ No HTTPS: {result['phishing']['key_indicators']['no_https']}
383
+
384
+ 🛡️ Malware Analysis:
385
+ - Model Prediction: {result['malware']['prediction']}
386
+ - Risk Score: {result['malware']['risk_score']}/100 ({result['risk_analysis']['malware_risk_level']} Risk)
387
+ - Key Indicators:
388
+ IP Address: {result['malware']['key_indicators']['has_ip']}
389
+ Shortened URL: {result['malware']['key_indicators']['is_shortened']}
390
+ Suspicious Keywords: {result['malware']['key_indicators']['suspicious_keywords']}
391
+ • New Domain (<30 days): {result['malware']['key_indicators']['new_domain']}
392
+ • Low DNS TTL: {result['malware']['key_indicators']['low_ttl']}
393
+ • Invalid SSL: {result['malware']['key_indicators']['invalid_ssl']}
394
+
395
+ 📊 Risk Comparison:
396
+ - Phishing Risk: {result['phishing']['risk_score']}/100
397
+ - Malware Risk: {result['malware']['risk_score']}/100
398
+ - Risk Difference: {abs(result['phishing']['risk_score'] - result['malware']['risk_score'])} points
399
  """
400
 
401
  return output
 
403
  demo = gr.Interface(
404
  fn=interface_fn,
405
  inputs=gr.Textbox(label="Enter URL", placeholder="https://example.com", lines=1),
406
+ outputs=gr.Textbox(label="Enhanced Threat Analysis Report", lines=25),
407
+ title="🛡 Advanced URL Threat Analyzer with Logical Truth Table",
408
+ description="Enhanced multi-layered detection system with logical decision matrix for 100% accurate classification",
409
  examples=[
410
  ["https://www.paypal-login-secure.com/verify"],
411
  ["https://free-movie-downloads.xyz/get.exe"],
412
  ["https://www.microsoft.com/en-us/"],
413
  ["http://192.168.1.100/install-update"],
414
+ ["https://secure-apple-id-confirm.com"],
415
+ ["https://bit.ly/malware-download"],
416
+ ["https://banking-update.tk/signin"]
417
  ],
418
  theme="soft"
419
  )