quantum-drive committed on
Commit
881125c
·
verified ·
1 Parent(s): dd2c09d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +128 -126
app.py CHANGED
@@ -161,135 +161,131 @@ def prepare_malware_input(url):
161
  return df
162
 
163
  # -------------------------------
164
- # ENHANCED RISK SCORING SYSTEM
165
  # -------------------------------
166
  def calculate_phishing_risk(features):
167
- """Calculate enhanced phishing risk score"""
168
  risk_score = 0
169
 
170
- # Critical indicators (High Weight)
171
  if features['has_ip_address']:
172
- risk_score += 35 # IP addresses are major red flag for phishing
173
- if features['is_shortened']:
174
- risk_score += 30 # URL shorteners commonly used in phishing
175
- if features['has_suspicious_words']:
176
- risk_score += 25 # Banking/login terms are key phishing indicators
177
 
178
- # Important indicators (Medium Weight)
179
- if features['suspicious_tld']:
180
- risk_score += 20 # Suspicious TLDs often used for phishing
181
- risk_score += min(features['path_keyword_count'] * 15, 30) # Keywords in path
182
- risk_score += min(features['query_keyword_count'] * 10, 20) # Keywords in query
183
 
184
- # Supporting indicators (Low Weight)
185
- risk_score += min(features['num_special_chars'] * 2, 15)
186
- risk_score += min(features['num_hyphens'] * 3, 15)
187
- if features['url_length'] > 75:
188
- risk_score += 10
189
- if not features['has_https']:
190
- risk_score += 5 # No HTTPS is suspicious for login pages
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
  return min(risk_score, 100)
193
 
194
  def calculate_malware_risk(features):
195
- """Calculate enhanced malware risk score"""
196
  risk_score = 0
197
 
198
- # Critical indicators (High Weight)
199
  if features['has_ip_address']:
200
- risk_score += 35 # Direct IP access common in malware
201
- if features['has_suspicious_keyword']:
202
- risk_score += 30 # Download/crack keywords are major indicators
203
- if features['is_shortened']:
204
- risk_score += 25 # URL shorteners hide malicious destinations
205
 
206
- # Important indicators (Medium Weight)
207
- risk_score += min(features['path_keyword_count'] * 20, 40) # Malware keywords in path
 
 
 
208
 
209
- # Domain age indicators
210
- if 0 <= features['domain_age_days'] < 30:
211
- risk_score += 25 # Very new domains are highly suspicious
212
- elif 30 <= features['domain_age_days'] < 90:
213
- risk_score += 15 # New domains are suspicious
214
- elif features['domain_age_days'] > 365*15: # Very old domains can be compromised
215
- risk_score += 10
216
 
217
  # Network indicators
218
  if 0 < features['dns_ttl'] < 300:
219
- risk_score += 20 # Low TTL indicates fast-flux hosting
220
  if not features['ssl_valid'] and features['is_https']:
221
- risk_score += 15 # Invalid SSL certificate
 
 
222
 
223
- # Supporting indicators
224
- risk_score += min(features['num_special_chars'] * 2, 10)
225
- if features['url_length'] > 100:
226
- risk_score += 10
 
227
 
228
  return min(risk_score, 100)
229
 
230
  # -------------------------------
231
- # LOGICAL TRUTH TABLE DECISION SYSTEM
232
  # -------------------------------
233
  def get_final_prediction(phishing_pred, malware_pred, phishing_risk, malware_risk):
234
  """
235
- Enhanced logical truth table for accurate classification
236
-
237
- Truth Table Logic:
238
- - Model predictions have primary weight
239
- - Risk scores provide secondary validation and override capability
240
- - Clear thresholds prevent misclassification
241
  """
242
 
243
- # Define risk thresholds
244
- HIGH_RISK = 70
245
- MEDIUM_RISK = 45
246
- LOW_RISK = 25
247
 
248
- # CASE 1: Both models detect threats
249
- if phishing_pred == "Phishing" and malware_pred == "malicious":
250
- # Use risk scores to determine primary threat type
251
- if malware_risk >= HIGH_RISK and malware_risk > phishing_risk + 15:
252
- return "Malicious", "Both models detected threat - malware characteristics dominant"
253
- elif phishing_risk >= HIGH_RISK and phishing_risk > malware_risk + 15:
254
- return "Phishing", "Both models detected threat - phishing characteristics dominant"
255
- else:
256
- # When both risks are similar, use model confidence (default to phishing for similar scores)
257
- return "Phishing", "Both models detected threat - mixed characteristics favor phishing"
258
 
259
- # CASE 2: Only malware model detects threat
260
- elif malware_pred == "malicious" and phishing_pred != "Phishing":
261
- if malware_risk >= MEDIUM_RISK:
262
- return "Malicious", "Malware model detection confirmed by risk indicators"
263
- elif phishing_risk >= HIGH_RISK:
264
- return "Phishing", "Malware detected but phishing risk indicators dominant"
 
265
  else:
266
- return "Malicious", "Malware model detection (low confidence)"
267
 
268
- # CASE 3: Only phishing model detects threat
269
- elif phishing_pred == "Phishing" and malware_pred != "malicious":
270
- if phishing_risk >= MEDIUM_RISK:
271
- return "Phishing", "Phishing model detection confirmed by risk indicators"
272
- elif malware_risk >= HIGH_RISK:
273
- return "Malicious", "Phishing detected but malware risk indicators dominant"
274
  else:
275
- return "Phishing", "Phishing model detection (low confidence)"
276
 
277
- # CASE 4: Both models report benign - Risk-based override
278
- else:
279
- if malware_risk >= HIGH_RISK and phishing_risk >= HIGH_RISK:
280
- # Both risks high - choose based on which is higher
281
- if malware_risk > phishing_risk:
282
- return "Malicious", "Models missed threat - high malware risk detected"
283
- else:
284
- return "Phishing", "Models missed threat - high phishing risk detected"
285
- elif malware_risk >= HIGH_RISK:
286
- return "Malicious", "Models reported benign but high malware risk indicators"
287
- elif phishing_risk >= HIGH_RISK:
288
- return "Phishing", "Models reported benign but high phishing risk indicators"
289
- elif malware_risk >= MEDIUM_RISK or phishing_risk >= MEDIUM_RISK:
290
- return "Suspicious", "Models reported benign but moderate risk indicators present"
291
  else:
292
- return "Benign", "No threats detected by models or risk analysis"
293
 
294
  def analyze_url(url):
295
  try:
@@ -305,11 +301,11 @@ def analyze_url(url):
305
  phishing_pred = phishing_model.predict(phishing_df)[0]
306
  malware_pred = malware_model.predict(malware_df)[0]
307
 
308
- # Calculate enhanced risk scores
309
  phishing_risk = calculate_phishing_risk(phishing_features)
310
  malware_risk = calculate_malware_risk(malware_features)
311
 
312
- # Get final prediction using logical truth table
313
  final_result, decision_reason = get_final_prediction(
314
  phishing_pred, malware_pred, phishing_risk, malware_risk
315
  )
@@ -344,9 +340,9 @@ def analyze_url(url):
344
  }
345
  },
346
  "risk_analysis": {
347
- "phishing_risk_level": "High" if phishing_risk >= 70 else "Medium" if phishing_risk >= 45 else "Low",
348
- "malware_risk_level": "High" if malware_risk >= 70 else "Medium" if malware_risk >= 45 else "Low",
349
- "confidence": "High" if abs(phishing_risk - malware_risk) > 25 else "Medium"
350
  }
351
  }
352
 
@@ -366,36 +362,40 @@ def interface_fn(url):
366
  # Format output with enhanced information
367
  output = f"""
368
  ๐Ÿ” URL Analysis Report: {result['url']}
369
- ๐ŸŽฏ Final Verdict: {result['final_result']}
 
370
  ๐Ÿ“Œ Decision Logic: {result['decision_reason']}
371
  ๐Ÿ”ฎ Analysis Confidence: {result['risk_analysis']['confidence']}
372
 
373
- ๐Ÿ”’ Phishing Analysis:
374
- - Model Prediction: {result['phishing']['prediction']}
375
- - Risk Score: {result['phishing']['risk_score']}/100 ({result['risk_analysis']['phishing_risk_level']} Risk)
376
- - Key Indicators:
377
- โ€ข IP Address: {result['phishing']['key_indicators']['has_ip']}
378
- โ€ข Shortened URL: {result['phishing']['key_indicators']['is_shortened']}
379
- โ€ข Suspicious TLD: {result['phishing']['key_indicators']['suspicious_tld']}
380
- โ€ข Suspicious Words: {result['phishing']['key_indicators']['suspicious_words']}
381
- โ€ข Path Keywords: {result['phishing']['key_indicators']['path_keywords']}
382
- โ€ข No HTTPS: {result['phishing']['key_indicators']['no_https']}
 
 
383
 
384
- ๐Ÿ›ก๏ธ Malware Analysis:
385
- - Model Prediction: {result['malware']['prediction']}
386
- - Risk Score: {result['malware']['risk_score']}/100 ({result['risk_analysis']['malware_risk_level']} Risk)
387
- - Key Indicators:
388
- โ€ข IP Address: {result['malware']['key_indicators']['has_ip']}
389
- โ€ข Shortened URL: {result['malware']['key_indicators']['is_shortened']}
390
- โ€ข Suspicious Keywords: {result['malware']['key_indicators']['suspicious_keywords']}
391
- โ€ข New Domain (<30 days): {result['malware']['key_indicators']['new_domain']}
392
- โ€ข Low DNS TTL: {result['malware']['key_indicators']['low_ttl']}
393
- โ€ข Invalid SSL: {result['malware']['key_indicators']['invalid_ssl']}
394
 
395
- ๐Ÿ“Š Risk Comparison:
396
- - Phishing Risk: {result['phishing']['risk_score']}/100
397
- - Malware Risk: {result['malware']['risk_score']}/100
398
- - Risk Difference: {abs(result['phishing']['risk_score'] - result['malware']['risk_score'])} points
 
399
  """
400
 
401
  return output
@@ -403,13 +403,15 @@ def interface_fn(url):
403
  demo = gr.Interface(
404
  fn=interface_fn,
405
  inputs=gr.Textbox(label="Enter URL", placeholder="https://example.com", lines=1),
406
- outputs=gr.Textbox(label="Enhanced Threat Analysis Report", lines=25),
407
- title="๐Ÿ›ก๏ธ Advanced URL Threat Analyzer with Logical Truth Table",
408
- description="Enhanced multi-layered detection system with logical decision matrix for 100% accurate classification",
409
  examples=[
 
 
 
410
  ["https://www.paypal-login-secure.com/verify"],
411
  ["https://free-movie-downloads.xyz/get.exe"],
412
- ["https://www.microsoft.com/en-us/"],
413
  ["http://192.168.1.100/install-update"],
414
  ["https://secure-apple-id-confirm.com"],
415
  ["https://bit.ly/malware-download"],
 
161
  return df
162
 
163
  # -------------------------------
164
+ # REFINED RISK SCORING SYSTEM
165
  # -------------------------------
166
  def calculate_phishing_risk(features):
167
+ """Calculate refined phishing risk score with better thresholds"""
168
  risk_score = 0
169
 
170
+ # Critical indicators - only for clearly suspicious cases
171
  if features['has_ip_address']:
172
+ risk_score += 40 # Direct IP is major red flag
173
+ if features['is_shortened'] and features['has_suspicious_words']:
174
+ risk_score += 35 # Shortened URL with suspicious words
175
+ elif features['is_shortened']:
176
+ risk_score += 15 # Shortened URL alone is less suspicious
177
 
178
+ # Phishing-specific indicators
179
+ if features['has_suspicious_words'] and features['suspicious_tld']:
180
+ risk_score += 30 # Banking terms + suspicious TLD
181
+ elif features['has_suspicious_words']:
182
+ risk_score += 10 # Banking terms alone (could be legitimate)
183
 
184
+ # Domain and structure indicators
185
+ if features['suspicious_tld'] and features['num_hyphens'] > 2:
186
+ risk_score += 25 # Suspicious TLD with many hyphens
187
+ elif features['suspicious_tld']:
188
+ risk_score += 10 # Suspicious TLD alone
189
+
190
+ # Multiple suspicious indicators together
191
+ if features['path_keyword_count'] > 1 and features['query_keyword_count'] > 0:
192
+ risk_score += 20
193
+ elif features['path_keyword_count'] > 0:
194
+ risk_score += 8
195
+
196
+ # Length and special character penalties (reduced)
197
+ if features['url_length'] > 100:
198
+ risk_score += 8
199
+ if features['num_special_chars'] > 10:
200
+ risk_score += 5
201
+ if features['num_hyphens'] > 3:
202
+ risk_score += 5
203
 
204
  return min(risk_score, 100)
205
 
206
  def calculate_malware_risk(features):
207
+ """Calculate refined malware risk score with better thresholds"""
208
  risk_score = 0
209
 
210
+ # Critical indicators - only for clearly suspicious cases
211
  if features['has_ip_address']:
212
+ risk_score += 40 # Direct IP access
213
+ if features['has_suspicious_keyword'] and features['is_shortened']:
214
+ risk_score += 35 # Malware keywords + shortened URL
215
+ elif features['has_suspicious_keyword']:
216
+ risk_score += 15 # Malware keywords alone
217
 
218
+ # Path-based malware indicators
219
+ if features['path_keyword_count'] > 2:
220
+ risk_score += 30 # Multiple malware keywords in path
221
+ elif features['path_keyword_count'] > 0:
222
+ risk_score += 12
223
 
224
+ # Domain age indicators (refined)
225
+ if 0 <= features['domain_age_days'] < 7:
226
+ risk_score += 30 # Very new domains (1 week)
227
+ elif 7 <= features['domain_age_days'] < 30:
228
+ risk_score += 15 # New domains (1 month)
229
+ elif features['domain_age_days'] > 365*20: # Very old compromised domains
230
+ risk_score += 8
231
 
232
  # Network indicators
233
  if 0 < features['dns_ttl'] < 300:
234
+ risk_score += 25 # Low TTL indicates fast-flux hosting
235
  if not features['ssl_valid'] and features['is_https']:
236
+ risk_score += 20 # Invalid SSL certificate
237
+ elif not features['is_https'] and features['has_suspicious_keyword']:
238
+ risk_score += 15 # No HTTPS with malware keywords
239
 
240
+ # Supporting indicators (reduced impact)
241
+ if features['url_length'] > 120:
242
+ risk_score += 8
243
+ if features['num_special_chars'] > 15:
244
+ risk_score += 5
245
 
246
  return min(risk_score, 100)
247
 
248
  # -------------------------------
249
+ # SIMPLE RISK-BASED DECISION SYSTEM
250
  # -------------------------------
251
  def get_final_prediction(phishing_pred, malware_pred, phishing_risk, malware_risk):
252
  """
253
+ Simple risk-based decision system:
254
+ 1. Compare risk scores directly
255
+ 2. Use higher risk score for final prediction
256
+ 3. Add safety thresholds for benign classification
 
 
257
  """
258
 
259
+ # Safety thresholds
260
+ THREAT_THRESHOLD = 25 # Minimum score to consider as threat
261
+ HIGH_CONFIDENCE_THRESHOLD = 15 # Risk difference for high confidence
 
262
 
263
+ # Case 1: Both risks are very low - definitely benign
264
+ if phishing_risk < THREAT_THRESHOLD and malware_risk < THREAT_THRESHOLD:
265
+ return "Benign", f"Low risk scores (Phishing: {phishing_risk}, Malware: {malware_risk})"
 
 
 
 
 
 
 
266
 
267
+ # Case 2: One or both risks are above threshold
268
+ risk_difference = abs(phishing_risk - malware_risk)
269
+
270
+ if phishing_risk > malware_risk:
271
+ if phishing_risk >= THREAT_THRESHOLD:
272
+ confidence = "High" if risk_difference >= HIGH_CONFIDENCE_THRESHOLD else "Medium"
273
+ return "Phishing", f"Phishing risk higher ({phishing_risk} vs {malware_risk}) - {confidence} confidence"
274
  else:
275
+ return "Benign", f"Phishing risk slightly higher but below threshold ({phishing_risk})"
276
 
277
+ elif malware_risk > phishing_risk:
278
+ if malware_risk >= THREAT_THRESHOLD:
279
+ confidence = "High" if risk_difference >= HIGH_CONFIDENCE_THRESHOLD else "Medium"
280
+ return "Malicious", f"Malware risk higher ({malware_risk} vs {phishing_risk}) - {confidence} confidence"
 
 
281
  else:
282
+ return "Benign", f"Malware risk slightly higher but below threshold ({malware_risk})"
283
 
284
+ else: # Equal risks
285
+ if phishing_risk >= THREAT_THRESHOLD:
286
+ return "Suspicious", f"Equal risk scores ({phishing_risk}) - requires manual review"
 
 
 
 
 
 
 
 
 
 
 
287
  else:
288
+ return "Benign", f"Equal low risk scores ({phishing_risk})"
289
 
290
  def analyze_url(url):
291
  try:
 
301
  phishing_pred = phishing_model.predict(phishing_df)[0]
302
  malware_pred = malware_model.predict(malware_df)[0]
303
 
304
+ # Calculate refined risk scores
305
  phishing_risk = calculate_phishing_risk(phishing_features)
306
  malware_risk = calculate_malware_risk(malware_features)
307
 
308
+ # Get final prediction using simple risk-based system
309
  final_result, decision_reason = get_final_prediction(
310
  phishing_pred, malware_pred, phishing_risk, malware_risk
311
  )
 
340
  }
341
  },
342
  "risk_analysis": {
343
+ "phishing_risk_level": "High" if phishing_risk >= 60 else "Medium" if phishing_risk >= 25 else "Low",
344
+ "malware_risk_level": "High" if malware_risk >= 60 else "Medium" if malware_risk >= 25 else "Low",
345
+ "confidence": "High" if abs(phishing_risk - malware_risk) >= 15 else "Medium"
346
  }
347
  }
348
 
 
362
  # Format output with enhanced information
363
  output = f"""
364
  ๐Ÿ” URL Analysis Report: {result['url']}
365
+
366
+ ๐ŸŽฏ FINAL VERDICT: {result['final_result']}
367
  ๐Ÿ“Œ Decision Logic: {result['decision_reason']}
368
  ๐Ÿ”ฎ Analysis Confidence: {result['risk_analysis']['confidence']}
369
 
370
+ โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
371
+
372
+ ๐Ÿ”’ PHISHING ANALYSIS:
373
+ โ€ข Model Prediction: {result['phishing']['prediction']}
374
+ โ€ข Risk Score: {result['phishing']['risk_score']}/100 ({result['risk_analysis']['phishing_risk_level']} Risk)
375
+ โ€ข Key Indicators:
376
+ - IP Address: {result['phishing']['key_indicators']['has_ip']}
377
+ - Shortened URL: {result['phishing']['key_indicators']['is_shortened']}
378
+ - Suspicious TLD: {result['phishing']['key_indicators']['suspicious_tld']}
379
+ - Suspicious Words: {result['phishing']['key_indicators']['suspicious_words']}
380
+ - Path Keywords: {result['phishing']['key_indicators']['path_keywords']}
381
+ - No HTTPS: {result['phishing']['key_indicators']['no_https']}
382
 
383
+ ๐Ÿ›ก๏ธ MALWARE ANALYSIS:
384
+ โ€ข Model Prediction: {result['malware']['prediction']}
385
+ โ€ข Risk Score: {result['malware']['risk_score']}/100 ({result['risk_analysis']['malware_risk_level']} Risk)
386
+ โ€ข Key Indicators:
387
+ - IP Address: {result['malware']['key_indicators']['has_ip']}
388
+ - Shortened URL: {result['malware']['key_indicators']['is_shortened']}
389
+ - Suspicious Keywords: {result['malware']['key_indicators']['suspicious_keywords']}
390
+ - New Domain (<30 days): {result['malware']['key_indicators']['new_domain']}
391
+ - Low DNS TTL: {result['malware']['key_indicators']['low_ttl']}
392
+ - Invalid SSL: {result['malware']['key_indicators']['invalid_ssl']}
393
 
394
+ ๐Ÿ“Š RISK COMPARISON:
395
+ โ€ข Phishing Risk: {result['phishing']['risk_score']}/100
396
+ โ€ข Malware Risk: {result['malware']['risk_score']}/100
397
+ โ€ข Risk Difference: {abs(result['phishing']['risk_score'] - result['malware']['risk_score'])} points
398
+ โ€ข Winner: {"Phishing" if result['phishing']['risk_score'] > result['malware']['risk_score'] else "Malware" if result['malware']['risk_score'] > result['phishing']['risk_score'] else "Equal"}
399
  """
400
 
401
  return output
 
403
  demo = gr.Interface(
404
  fn=interface_fn,
405
  inputs=gr.Textbox(label="Enter URL", placeholder="https://example.com", lines=1),
406
+ outputs=gr.Textbox(label="๐Ÿ›ก๏ธ Simple Risk-Based Threat Analysis", lines=30),
407
+ title="๐Ÿ›ก๏ธ Fixed URL Threat Analyzer - Risk Score Based",
408
+ description="Simple and accurate threat detection based on risk scores. Higher risk score wins!",
409
  examples=[
410
+ ["https://www.google.com"],
411
+ ["https://www.facebook.com"],
412
+ ["https://www.microsoft.com/en-us/"],
413
  ["https://www.paypal-login-secure.com/verify"],
414
  ["https://free-movie-downloads.xyz/get.exe"],
 
415
  ["http://192.168.1.100/install-update"],
416
  ["https://secure-apple-id-confirm.com"],
417
  ["https://bit.ly/malware-download"],