Update app.py
Browse files
app.py
CHANGED
|
@@ -161,135 +161,131 @@ def prepare_malware_input(url):
|
|
| 161 |
return df
|
| 162 |
|
| 163 |
# -------------------------------
|
| 164 |
-
#
|
| 165 |
# -------------------------------
|
| 166 |
def calculate_phishing_risk(features):
|
| 167 |
-
"""Calculate
|
| 168 |
risk_score = 0
|
| 169 |
|
| 170 |
-
# Critical indicators
|
| 171 |
if features['has_ip_address']:
|
| 172 |
-
risk_score +=
|
| 173 |
-
if features['is_shortened']:
|
| 174 |
-
risk_score +=
|
| 175 |
-
|
| 176 |
-
risk_score +=
|
| 177 |
|
| 178 |
-
#
|
| 179 |
-
if features['suspicious_tld']:
|
| 180 |
-
risk_score +=
|
| 181 |
-
|
| 182 |
-
|
| 183 |
|
| 184 |
-
#
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
risk_score += 10
|
| 189 |
-
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
return min(risk_score, 100)
|
| 193 |
|
| 194 |
def calculate_malware_risk(features):
|
| 195 |
-
"""Calculate
|
| 196 |
risk_score = 0
|
| 197 |
|
| 198 |
-
# Critical indicators
|
| 199 |
if features['has_ip_address']:
|
| 200 |
-
risk_score +=
|
| 201 |
-
if features['has_suspicious_keyword']:
|
| 202 |
-
risk_score +=
|
| 203 |
-
|
| 204 |
-
risk_score +=
|
| 205 |
|
| 206 |
-
#
|
| 207 |
-
|
|
|
|
|
|
|
|
|
|
| 208 |
|
| 209 |
-
# Domain age indicators
|
| 210 |
-
if 0 <= features['domain_age_days'] <
|
| 211 |
-
risk_score +=
|
| 212 |
-
elif
|
| 213 |
-
risk_score += 15 # New domains
|
| 214 |
-
elif features['domain_age_days'] > 365*
|
| 215 |
-
risk_score +=
|
| 216 |
|
| 217 |
# Network indicators
|
| 218 |
if 0 < features['dns_ttl'] < 300:
|
| 219 |
-
risk_score +=
|
| 220 |
if not features['ssl_valid'] and features['is_https']:
|
| 221 |
-
risk_score +=
|
|
|
|
|
|
|
| 222 |
|
| 223 |
-
# Supporting indicators
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
|
|
|
| 227 |
|
| 228 |
return min(risk_score, 100)
|
| 229 |
|
| 230 |
# -------------------------------
|
| 231 |
-
#
|
| 232 |
# -------------------------------
|
| 233 |
def get_final_prediction(phishing_pred, malware_pred, phishing_risk, malware_risk):
|
| 234 |
"""
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
- Risk scores provide secondary validation and override capability
|
| 240 |
-
- Clear thresholds prevent misclassification
|
| 241 |
"""
|
| 242 |
|
| 243 |
-
#
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
LOW_RISK = 25
|
| 247 |
|
| 248 |
-
#
|
| 249 |
-
if
|
| 250 |
-
|
| 251 |
-
if malware_risk >= HIGH_RISK and malware_risk > phishing_risk + 15:
|
| 252 |
-
return "Malicious", "Both models detected threat - malware characteristics dominant"
|
| 253 |
-
elif phishing_risk >= HIGH_RISK and phishing_risk > malware_risk + 15:
|
| 254 |
-
return "Phishing", "Both models detected threat - phishing characteristics dominant"
|
| 255 |
-
else:
|
| 256 |
-
# When both risks are similar, use model confidence (default to phishing for similar scores)
|
| 257 |
-
return "Phishing", "Both models detected threat - mixed characteristics favor phishing"
|
| 258 |
|
| 259 |
-
#
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
|
|
|
| 265 |
else:
|
| 266 |
-
return "
|
| 267 |
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
return "
|
| 272 |
-
elif malware_risk >= HIGH_RISK:
|
| 273 |
-
return "Malicious", "Phishing detected but malware risk indicators dominant"
|
| 274 |
else:
|
| 275 |
-
return "
|
| 276 |
|
| 277 |
-
#
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
# Both risks high - choose based on which is higher
|
| 281 |
-
if malware_risk > phishing_risk:
|
| 282 |
-
return "Malicious", "Models missed threat - high malware risk detected"
|
| 283 |
-
else:
|
| 284 |
-
return "Phishing", "Models missed threat - high phishing risk detected"
|
| 285 |
-
elif malware_risk >= HIGH_RISK:
|
| 286 |
-
return "Malicious", "Models reported benign but high malware risk indicators"
|
| 287 |
-
elif phishing_risk >= HIGH_RISK:
|
| 288 |
-
return "Phishing", "Models reported benign but high phishing risk indicators"
|
| 289 |
-
elif malware_risk >= MEDIUM_RISK or phishing_risk >= MEDIUM_RISK:
|
| 290 |
-
return "Suspicious", "Models reported benign but moderate risk indicators present"
|
| 291 |
else:
|
| 292 |
-
return "Benign", "
|
| 293 |
|
| 294 |
def analyze_url(url):
|
| 295 |
try:
|
|
@@ -305,11 +301,11 @@ def analyze_url(url):
|
|
| 305 |
phishing_pred = phishing_model.predict(phishing_df)[0]
|
| 306 |
malware_pred = malware_model.predict(malware_df)[0]
|
| 307 |
|
| 308 |
-
# Calculate
|
| 309 |
phishing_risk = calculate_phishing_risk(phishing_features)
|
| 310 |
malware_risk = calculate_malware_risk(malware_features)
|
| 311 |
|
| 312 |
-
# Get final prediction using
|
| 313 |
final_result, decision_reason = get_final_prediction(
|
| 314 |
phishing_pred, malware_pred, phishing_risk, malware_risk
|
| 315 |
)
|
|
@@ -344,9 +340,9 @@ def analyze_url(url):
|
|
| 344 |
}
|
| 345 |
},
|
| 346 |
"risk_analysis": {
|
| 347 |
-
"phishing_risk_level": "High" if phishing_risk >=
|
| 348 |
-
"malware_risk_level": "High" if malware_risk >=
|
| 349 |
-
"confidence": "High" if abs(phishing_risk - malware_risk) >
|
| 350 |
}
|
| 351 |
}
|
| 352 |
|
|
@@ -366,36 +362,40 @@ def interface_fn(url):
|
|
| 366 |
# Format output with enhanced information
|
| 367 |
output = f"""
|
| 368 |
๐ URL Analysis Report: {result['url']}
|
| 369 |
-
|
|
|
|
| 370 |
๐ Decision Logic: {result['decision_reason']}
|
| 371 |
๐ฎ Analysis Confidence: {result['risk_analysis']['confidence']}
|
| 372 |
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
|
|
|
|
|
|
| 383 |
|
| 384 |
-
๐ก๏ธ
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
|
| 395 |
-
๐
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
|
|
|
| 399 |
"""
|
| 400 |
|
| 401 |
return output
|
|
@@ -403,13 +403,15 @@ def interface_fn(url):
|
|
| 403 |
demo = gr.Interface(
|
| 404 |
fn=interface_fn,
|
| 405 |
inputs=gr.Textbox(label="Enter URL", placeholder="https://example.com", lines=1),
|
| 406 |
-
outputs=gr.Textbox(label="
|
| 407 |
-
title="๐ก๏ธ
|
| 408 |
-
description="
|
| 409 |
examples=[
|
|
|
|
|
|
|
|
|
|
| 410 |
["https://www.paypal-login-secure.com/verify"],
|
| 411 |
["https://free-movie-downloads.xyz/get.exe"],
|
| 412 |
-
["https://www.microsoft.com/en-us/"],
|
| 413 |
["http://192.168.1.100/install-update"],
|
| 414 |
["https://secure-apple-id-confirm.com"],
|
| 415 |
["https://bit.ly/malware-download"],
|
|
|
|
| 161 |
return df
|
| 162 |
|
| 163 |
# -------------------------------
|
| 164 |
+
# REFINED RISK SCORING SYSTEM
|
| 165 |
# -------------------------------
|
| 166 |
def calculate_phishing_risk(features):
    """Score how phishing-like a URL looks, on a 0-100 scale.

    Each indicator found in ``features`` adds a fixed number of points;
    the sum is capped at 100.

    Args:
        features: Mapping of extracted URL features. Keys read here:
            ``has_ip_address``, ``is_shortened``, ``has_suspicious_words``,
            ``suspicious_tld``, ``num_hyphens``, ``path_keyword_count``,
            ``query_keyword_count`` (only when ``path_keyword_count`` > 1),
            ``url_length`` and ``num_special_chars``.

    Returns:
        The accumulated score, never greater than 100.
    """
    shortened = features['is_shortened']
    bait_words = features['has_suspicious_words']
    risky_tld = features['suspicious_tld']
    hyphens = features['num_hyphens']
    path_hits = features['path_keyword_count']

    # A raw IP address as host is the single strongest signal.
    total = 40 if features['has_ip_address'] else 0

    # A shortener is weighted more heavily when suspicious words are present.
    if shortened:
        total += 35 if bait_words else 15

    # Suspicious words alone may be legitimate; with a suspicious TLD
    # they count three times as much.
    if bait_words:
        total += 30 if risky_tld else 10

    # A suspicious TLD is amplified by a hyphen-heavy URL.
    if risky_tld:
        total += 25 if hyphens > 2 else 10

    # Keywords in both path and query outweigh keywords in the path only.
    if path_hits > 1 and features['query_keyword_count'] > 0:
        total += 20
    elif path_hits > 0:
        total += 8

    # Small structural penalties.
    if features['url_length'] > 100:
        total += 8
    if features['num_special_chars'] > 10:
        total += 5
    if hyphens > 3:
        total += 5

    return min(total, 100)
|
| 205 |
|
| 206 |
def calculate_malware_risk(features):
    """Score how malware-like a URL looks, on a 0-100 scale.

    Each indicator found in ``features`` adds a fixed number of points;
    the sum is capped at 100.

    Args:
        features: Mapping of extracted URL/host features. Keys read here:
            ``has_ip_address``, ``has_suspicious_keyword``, ``is_shortened``
            (only when a suspicious keyword is present),
            ``path_keyword_count``, ``domain_age_days``, ``dns_ttl``,
            ``ssl_valid``, ``is_https``, ``url_length`` and
            ``num_special_chars``.

    Returns:
        The accumulated score, never greater than 100.
    """
    keyword_hit = features['has_suspicious_keyword']
    path_hits = features['path_keyword_count']
    age_days = features['domain_age_days']
    cert_ok = features['ssl_valid']
    uses_https = features['is_https']

    # Direct IP access.
    total = 40 if features['has_ip_address'] else 0

    # Malware keywords weigh more when hidden behind a shortener.
    if keyword_hit:
        total += 35 if features['is_shortened'] else 15

    # Keywords in the path: more than two is a much stronger signal.
    if path_hits > 2:
        total += 30
    elif path_hits > 0:
        total += 12

    # Domain age. Negative ages match no bucket and add nothing.
    if 0 <= age_days < 30:
        total += 30 if age_days < 7 else 15
    elif age_days > 365 * 20:
        total += 8

    # Low (but non-zero) TTL indicates fast-flux hosting.
    if 0 < features['dns_ttl'] < 300:
        total += 25

    # HTTPS with an invalid certificate, or no HTTPS alongside keywords.
    if uses_https:
        if not cert_ok:
            total += 20
    elif keyword_hit:
        total += 15

    # Supporting indicators with a small weight.
    if features['url_length'] > 120:
        total += 8
    if features['num_special_chars'] > 15:
        total += 5

    return min(total, 100)
|
| 247 |
|
| 248 |
# -------------------------------
|
| 249 |
+
# SIMPLE RISK-BASED DECISION SYSTEM
|
| 250 |
# -------------------------------
|
| 251 |
def get_final_prediction(phishing_pred, malware_pred, phishing_risk, malware_risk):
    """
    Simple risk-based decision system.

    1. If both risk scores are below the threat threshold, the URL is benign.
    2. Otherwise the higher risk score decides the label.
    3. Equal scores at or above the threshold are flagged for manual review.

    Args:
        phishing_pred: Phishing model prediction. Accepted so the call
            signature stays stable, but not consulted by the decision.
        malware_pred: Malware model prediction. Likewise not consulted.
        phishing_risk: Numeric phishing risk score.
        malware_risk: Numeric malware risk score.

    Returns:
        A ``(label, reason)`` tuple. ``label`` is one of ``"Benign"``,
        ``"Phishing"``, ``"Malicious"`` or ``"Suspicious"``; ``reason`` is a
        human-readable explanation that embeds the scores.
    """
    THREAT_THRESHOLD = 25  # Minimum score to consider as threat
    HIGH_CONFIDENCE_THRESHOLD = 15  # Risk difference for high confidence

    # Case 1: both risks are low - benign.
    if phishing_risk < THREAT_THRESHOLD and malware_risk < THREAT_THRESHOLD:
        return "Benign", f"Low risk scores (Phishing: {phishing_risk}, Malware: {malware_risk})"

    # Case 2: at least one score reached the threshold, so for ordinary
    # numeric scores the larger one is guaranteed to be >= THREAT_THRESHOLD.
    # The per-branch "below threshold -> Benign" fallbacks that used to
    # follow could never run and have been removed.
    risk_difference = abs(phishing_risk - malware_risk)
    confidence = "High" if risk_difference >= HIGH_CONFIDENCE_THRESHOLD else "Medium"

    if phishing_risk > malware_risk:
        return "Phishing", f"Phishing risk higher ({phishing_risk} vs {malware_risk}) - {confidence} confidence"

    if malware_risk > phishing_risk:
        return "Malicious", f"Malware risk higher ({malware_risk} vs {phishing_risk}) - {confidence} confidence"

    # Equal risks at or above the threshold: no clear winner.
    return "Suspicious", f"Equal risk scores ({phishing_risk}) - requires manual review"
|
| 289 |
|
| 290 |
def analyze_url(url):
|
| 291 |
try:
|
|
|
|
| 301 |
phishing_pred = phishing_model.predict(phishing_df)[0]
|
| 302 |
malware_pred = malware_model.predict(malware_df)[0]
|
| 303 |
|
| 304 |
+
# Calculate refined risk scores
|
| 305 |
phishing_risk = calculate_phishing_risk(phishing_features)
|
| 306 |
malware_risk = calculate_malware_risk(malware_features)
|
| 307 |
|
| 308 |
+
# Get final prediction using simple risk-based system
|
| 309 |
final_result, decision_reason = get_final_prediction(
|
| 310 |
phishing_pred, malware_pred, phishing_risk, malware_risk
|
| 311 |
)
|
|
|
|
| 340 |
}
|
| 341 |
},
|
| 342 |
"risk_analysis": {
|
| 343 |
+
"phishing_risk_level": "High" if phishing_risk >= 60 else "Medium" if phishing_risk >= 25 else "Low",
|
| 344 |
+
"malware_risk_level": "High" if malware_risk >= 60 else "Medium" if malware_risk >= 25 else "Low",
|
| 345 |
+
"confidence": "High" if abs(phishing_risk - malware_risk) >= 15 else "Medium"
|
| 346 |
}
|
| 347 |
}
|
| 348 |
|
|
|
|
| 362 |
# Format output with enhanced information
|
| 363 |
output = f"""
|
| 364 |
๐ URL Analysis Report: {result['url']}
|
| 365 |
+
|
| 366 |
+
๐ฏ FINAL VERDICT: {result['final_result']}
|
| 367 |
๐ Decision Logic: {result['decision_reason']}
|
| 368 |
๐ฎ Analysis Confidence: {result['risk_analysis']['confidence']}
|
| 369 |
|
| 370 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 371 |
+
|
| 372 |
+
๐ PHISHING ANALYSIS:
|
| 373 |
+
โข Model Prediction: {result['phishing']['prediction']}
|
| 374 |
+
โข Risk Score: {result['phishing']['risk_score']}/100 ({result['risk_analysis']['phishing_risk_level']} Risk)
|
| 375 |
+
โข Key Indicators:
|
| 376 |
+
- IP Address: {result['phishing']['key_indicators']['has_ip']}
|
| 377 |
+
- Shortened URL: {result['phishing']['key_indicators']['is_shortened']}
|
| 378 |
+
- Suspicious TLD: {result['phishing']['key_indicators']['suspicious_tld']}
|
| 379 |
+
- Suspicious Words: {result['phishing']['key_indicators']['suspicious_words']}
|
| 380 |
+
- Path Keywords: {result['phishing']['key_indicators']['path_keywords']}
|
| 381 |
+
- No HTTPS: {result['phishing']['key_indicators']['no_https']}
|
| 382 |
|
| 383 |
+
๐ก๏ธ MALWARE ANALYSIS:
|
| 384 |
+
โข Model Prediction: {result['malware']['prediction']}
|
| 385 |
+
โข Risk Score: {result['malware']['risk_score']}/100 ({result['risk_analysis']['malware_risk_level']} Risk)
|
| 386 |
+
โข Key Indicators:
|
| 387 |
+
- IP Address: {result['malware']['key_indicators']['has_ip']}
|
| 388 |
+
- Shortened URL: {result['malware']['key_indicators']['is_shortened']}
|
| 389 |
+
- Suspicious Keywords: {result['malware']['key_indicators']['suspicious_keywords']}
|
| 390 |
+
- New Domain (<30 days): {result['malware']['key_indicators']['new_domain']}
|
| 391 |
+
- Low DNS TTL: {result['malware']['key_indicators']['low_ttl']}
|
| 392 |
+
- Invalid SSL: {result['malware']['key_indicators']['invalid_ssl']}
|
| 393 |
|
| 394 |
+
๐ RISK COMPARISON:
|
| 395 |
+
โข Phishing Risk: {result['phishing']['risk_score']}/100
|
| 396 |
+
โข Malware Risk: {result['malware']['risk_score']}/100
|
| 397 |
+
โข Risk Difference: {abs(result['phishing']['risk_score'] - result['malware']['risk_score'])} points
|
| 398 |
+
โข Winner: {"Phishing" if result['phishing']['risk_score'] > result['malware']['risk_score'] else "Malware" if result['malware']['risk_score'] > result['phishing']['risk_score'] else "Equal"}
|
| 399 |
"""
|
| 400 |
|
| 401 |
return output
|
|
|
|
| 403 |
demo = gr.Interface(
|
| 404 |
fn=interface_fn,
|
| 405 |
inputs=gr.Textbox(label="Enter URL", placeholder="https://example.com", lines=1),
|
| 406 |
+
outputs=gr.Textbox(label="๐ก๏ธ Simple Risk-Based Threat Analysis", lines=30),
|
| 407 |
+
title="๐ก๏ธ Fixed URL Threat Analyzer - Risk Score Based",
|
| 408 |
+
description="Simple and accurate threat detection based on risk scores. Higher risk score wins!",
|
| 409 |
examples=[
|
| 410 |
+
["https://www.google.com"],
|
| 411 |
+
["https://www.facebook.com"],
|
| 412 |
+
["https://www.microsoft.com/en-us/"],
|
| 413 |
["https://www.paypal-login-secure.com/verify"],
|
| 414 |
["https://free-movie-downloads.xyz/get.exe"],
|
|
|
|
| 415 |
["http://192.168.1.100/install-update"],
|
| 416 |
["https://secure-apple-id-confirm.com"],
|
| 417 |
["https://bit.ly/malware-download"],
|