Spaces:

quantum-drive
/

malware-phishing-detection

Sleeping

App Files Files Community

quantum-drive committed on May 30, 2025

Commit

881125c

verified ·

1 Parent(s): dd2c09d

Update app.py

Browse files

Files changed (1) hide show

app.py +128 -126

app.py CHANGED Viewed

@@ -161,135 +161,131 @@ def prepare_malware_input(url):
     return df
 # -------------------------------
-# ENHANCED RISK SCORING SYSTEM
 # -------------------------------
 def calculate_phishing_risk(features):
-    """Calculate enhanced phishing risk score"""
     risk_score = 0
-    # Critical indicators (High Weight)
     if features['has_ip_address']:
-        risk_score += 35  # IP addresses are major red flag for phishing
-    if features['is_shortened']:
-        risk_score += 30  # URL shorteners commonly used in phishing
-    if features['has_suspicious_words']:
-        risk_score += 25  # Banking/login terms are key phishing indicators
-    # Important indicators (Medium Weight)
-    if features['suspicious_tld']:
-        risk_score += 20  # Suspicious TLDs often used for phishing
-    risk_score += min(features['path_keyword_count'] * 15, 30)  # Keywords in path
-    risk_score += min(features['query_keyword_count'] * 10, 20)  # Keywords in query
-    # Supporting indicators (Low Weight)
-    risk_score += min(features['num_special_chars'] * 2, 15)
-    risk_score += min(features['num_hyphens'] * 3, 15)
-    if features['url_length'] > 75:
-        risk_score += 10
-    if not features['has_https']:
-        risk_score += 5  # No HTTPS is suspicious for login pages
     return min(risk_score, 100)
 def calculate_malware_risk(features):
-    """Calculate enhanced malware risk score"""
     risk_score = 0
-    # Critical indicators (High Weight)
     if features['has_ip_address']:
-        risk_score += 35  # Direct IP access common in malware
-    if features['has_suspicious_keyword']:
-        risk_score += 30  # Download/crack keywords are major indicators
-    if features['is_shortened']:
-        risk_score += 25  # URL shorteners hide malicious destinations
-    # Important indicators (Medium Weight)
-    risk_score += min(features['path_keyword_count'] * 20, 40)  # Malware keywords in path
-    # Domain age indicators
-    if 0 <= features['domain_age_days'] < 30:
-        risk_score += 25  # Very new domains are highly suspicious
-    elif 30 <= features['domain_age_days'] < 90:
-        risk_score += 15  # New domains are suspicious
-    elif features['domain_age_days'] > 365*15:  # Very old domains can be compromised
-        risk_score += 10
     # Network indicators
     if 0 < features['dns_ttl'] < 300:
-        risk_score += 20  # Low TTL indicates fast-flux hosting
     if not features['ssl_valid'] and features['is_https']:
-        risk_score += 15  # Invalid SSL certificate
-    # Supporting indicators
-    risk_score += min(features['num_special_chars'] * 2, 10)
-    if features['url_length'] > 100:
-        risk_score += 10
     return min(risk_score, 100)
 # -------------------------------
-# LOGICAL TRUTH TABLE DECISION SYSTEM
 # -------------------------------
 def get_final_prediction(phishing_pred, malware_pred, phishing_risk, malware_risk):
     """
-    Enhanced logical truth table for accurate classification
-    Truth Table Logic:
-    - Model predictions have primary weight
-    - Risk scores provide secondary validation and override capability
-    - Clear thresholds prevent misclassification
     """
-    # Define risk thresholds
-    HIGH_RISK = 70
-    MEDIUM_RISK = 45
-    LOW_RISK = 25
-    # CASE 1: Both models detect threats
-    if phishing_pred == "Phishing" and malware_pred == "malicious":
-        # Use risk scores to determine primary threat type
-        if malware_risk >= HIGH_RISK and malware_risk > phishing_risk + 15:
-            return "Malicious", "Both models detected threat - malware characteristics dominant"
-        elif phishing_risk >= HIGH_RISK and phishing_risk > malware_risk + 15:
-            return "Phishing", "Both models detected threat - phishing characteristics dominant"
-        else:
-            # When both risks are similar, use model confidence (default to phishing for similar scores)
-            return "Phishing", "Both models detected threat - mixed characteristics favor phishing"
-    # CASE 2: Only malware model detects threat
-    elif malware_pred == "malicious" and phishing_pred != "Phishing":
-        if malware_risk >= MEDIUM_RISK:
-            return "Malicious", "Malware model detection confirmed by risk indicators"
-        elif phishing_risk >= HIGH_RISK:
-            return "Phishing", "Malware detected but phishing risk indicators dominant"
         else:
-            return "Malicious", "Malware model detection (low confidence)"
-    # CASE 3: Only phishing model detects threat
-    elif phishing_pred == "Phishing" and malware_pred != "malicious":
-        if phishing_risk >= MEDIUM_RISK:
-            return "Phishing", "Phishing model detection confirmed by risk indicators"
-        elif malware_risk >= HIGH_RISK:
-            return "Malicious", "Phishing detected but malware risk indicators dominant"
         else:
-            return "Phishing", "Phishing model detection (low confidence)"
-    # CASE 4: Both models report benign - Risk-based override
-    else:
-        if malware_risk >= HIGH_RISK and phishing_risk >= HIGH_RISK:
-            # Both risks high - choose based on which is higher
-            if malware_risk > phishing_risk:
-                return "Malicious", "Models missed threat - high malware risk detected"
-            else:
-                return "Phishing", "Models missed threat - high phishing risk detected"
-        elif malware_risk >= HIGH_RISK:
-            return "Malicious", "Models reported benign but high malware risk indicators"
-        elif phishing_risk >= HIGH_RISK:
-            return "Phishing", "Models reported benign but high phishing risk indicators"
-        elif malware_risk >= MEDIUM_RISK or phishing_risk >= MEDIUM_RISK:
-            return "Suspicious", "Models reported benign but moderate risk indicators present"
         else:
-            return "Benign", "No threats detected by models or risk analysis"
 def analyze_url(url):
     try:
@@ -305,11 +301,11 @@ def analyze_url(url):
         phishing_pred = phishing_model.predict(phishing_df)[0]
         malware_pred = malware_model.predict(malware_df)[0]
-        # Calculate enhanced risk scores
         phishing_risk = calculate_phishing_risk(phishing_features)
         malware_risk = calculate_malware_risk(malware_features)
-        # Get final prediction using logical truth table
         final_result, decision_reason = get_final_prediction(
             phishing_pred, malware_pred, phishing_risk, malware_risk
         )
@@ -344,9 +340,9 @@ def analyze_url(url):
                 }
             },
             "risk_analysis": {
-                "phishing_risk_level": "High" if phishing_risk >= 70 else "Medium" if phishing_risk >= 45 else "Low",
-                "malware_risk_level": "High" if malware_risk >= 70 else "Medium" if malware_risk >= 45 else "Low",
-                "confidence": "High" if abs(phishing_risk - malware_risk) > 25 else "Medium"
             }
         }
@@ -366,36 +362,40 @@ def interface_fn(url):
     # Format output with enhanced information
     output = f"""
 🔍 URL Analysis Report: {result['url']}
-🎯 Final Verdict: {result['final_result']}
 📌 Decision Logic: {result['decision_reason']}
 🔮 Analysis Confidence: {result['risk_analysis']['confidence']}
-🔒 Phishing Analysis:
-  - Model Prediction: {result['phishing']['prediction']}
-  - Risk Score: {result['phishing']['risk_score']}/100 ({result['risk_analysis']['phishing_risk_level']} Risk)
-  - Key Indicators:
-    • IP Address: {result['phishing']['key_indicators']['has_ip']}
-    • Shortened URL: {result['phishing']['key_indicators']['is_shortened']}
-    • Suspicious TLD: {result['phishing']['key_indicators']['suspicious_tld']}
-    • Suspicious Words: {result['phishing']['key_indicators']['suspicious_words']}
-    • Path Keywords: {result['phishing']['key_indicators']['path_keywords']}
-    • No HTTPS: {result['phishing']['key_indicators']['no_https']}
-🛡️ Malware Analysis:
-  - Model Prediction: {result['malware']['prediction']}
-  - Risk Score: {result['malware']['risk_score']}/100 ({result['risk_analysis']['malware_risk_level']} Risk)
-  - Key Indicators:
-    • IP Address: {result['malware']['key_indicators']['has_ip']}
-    • Shortened URL: {result['malware']['key_indicators']['is_shortened']}
-    • Suspicious Keywords: {result['malware']['key_indicators']['suspicious_keywords']}
-    • New Domain (<30 days): {result['malware']['key_indicators']['new_domain']}
-    • Low DNS TTL: {result['malware']['key_indicators']['low_ttl']}
-    • Invalid SSL: {result['malware']['key_indicators']['invalid_ssl']}
-📊 Risk Comparison:
-  - Phishing Risk: {result['phishing']['risk_score']}/100
-  - Malware Risk: {result['malware']['risk_score']}/100
-  - Risk Difference: {abs(result['phishing']['risk_score'] - result['malware']['risk_score'])} points
     """
     return output
@@ -403,13 +403,15 @@ def interface_fn(url):
 demo = gr.Interface(
     fn=interface_fn,
     inputs=gr.Textbox(label="Enter URL", placeholder="https://example.com", lines=1),
-    outputs=gr.Textbox(label="Enhanced Threat Analysis Report", lines=25),
-    title="🛡️ Advanced URL Threat Analyzer with Logical Truth Table",
-    description="Enhanced multi-layered detection system with logical decision matrix for 100% accurate classification",
     examples=[
         ["https://www.paypal-login-secure.com/verify"],
         ["https://free-movie-downloads.xyz/get.exe"],
-        ["https://www.microsoft.com/en-us/"],
         ["http://192.168.1.100/install-update"],
         ["https://secure-apple-id-confirm.com"],
         ["https://bit.ly/malware-download"],

     return df
 # -------------------------------
+# REFINED RISK SCORING SYSTEM
 # -------------------------------
 def calculate_phishing_risk(features):
     """Return a heuristic phishing risk score in the range 0-100.

     Most signals are scored in pairs: a URL shortener, phishing vocabulary
     or a suspicious TLD adds only a small weight on its own and a much
     larger one when it co-occurs with a second signal.

     Args:
         features: Mapping indexed with the keys ``has_ip_address``,
             ``is_shortened``, ``has_suspicious_words``, ``suspicious_tld``,
             ``num_hyphens``, ``path_keyword_count``, ``url_length`` and
             ``num_special_chars``. ``query_keyword_count`` is read only
             when ``path_keyword_count`` is greater than 1.

     Returns:
         The sum of the triggered weights, capped at 100.

     Raises:
         KeyError: If a key that is read is missing from ``features``.
     """
     shortened = features['is_shortened']
     has_words = features['has_suspicious_words']
     bad_tld = features['suspicious_tld']
     hyphens = features['num_hyphens']
     path_hits = features['path_keyword_count']
     risk_score = 0
     # A direct IP address is treated as the strongest single red flag.
     if features['has_ip_address']:
         risk_score += 40
     # A shortener alone is weak evidence; with suspicious words it is strong.
     if shortened and has_words:
         risk_score += 35
     elif shortened:
         risk_score += 15
     # Banking/login terms alone could be legitimate, so they score low
     # unless the TLD is suspicious as well.
     if has_words and bad_tld:
         risk_score += 30
     elif has_words:
         risk_score += 10
     # A suspicious TLD scores higher when the URL also has many hyphens.
     if bad_tld and hyphens > 2:
         risk_score += 25
     elif bad_tld:
         risk_score += 10
     # Keywords in both path and query outweigh path keywords alone; query
     # keywords without at least two path keywords add nothing here.
     if path_hits > 1 and features['query_keyword_count'] > 0:
         risk_score += 20
     elif path_hits > 0:
         risk_score += 8
     # Small penalties for URL length and character make-up.
     if features['url_length'] > 100:
         risk_score += 8
     if features['num_special_chars'] > 10:
         risk_score += 5
     if hyphens > 3:
         risk_score += 5
     return min(risk_score, 100)
 def calculate_malware_risk(features):
     """Return a heuristic malware risk score in the range 0-100.

     Args:
         features: Mapping indexed with the keys ``has_ip_address``,
             ``has_suspicious_keyword``, ``path_keyword_count``,
             ``domain_age_days``, ``dns_ttl``, ``ssl_valid``, ``is_https``,
             ``url_length`` and ``num_special_chars``. ``is_shortened`` is
             read only when ``has_suspicious_keyword`` is truthy.

     Returns:
         The sum of the triggered weights, capped at 100.

     Raises:
         KeyError: If a key that is read is missing from ``features``.
     """
     has_keyword = features['has_suspicious_keyword']
     path_hits = features['path_keyword_count']
     age_days = features['domain_age_days']
     uses_https = features['is_https']
     risk_score = 0
     # Direct IP access instead of a hostname.
     if features['has_ip_address']:
         risk_score += 40
     # Malware keywords weigh more when hidden behind a URL shortener.
     if has_keyword and features['is_shortened']:
         risk_score += 35
     elif has_keyword:
         risk_score += 15
     # Three or more malware keywords in the path is the strong case.
     if path_hits > 2:
         risk_score += 30
     elif path_hits > 0:
         risk_score += 12
     # Domain age: under a week, under a month, or older than 20 years.
     # Negative ages score nothing (presumably a "lookup failed" sentinel
     # -- confirm against the feature extractor).
     if 0 <= age_days < 7:
         risk_score += 30
     elif 7 <= age_days < 30:
         risk_score += 15
     elif age_days > 365 * 20:
         risk_score += 8
     # Low TTL indicates fast-flux hosting; a TTL of 0 or less is excluded.
     if 0 < features['dns_ttl'] < 300:
         risk_score += 25
     # HTTPS with an invalid certificate, otherwise plain HTTP combined
     # with malware keywords.
     if uses_https and not features['ssl_valid']:
         risk_score += 20
     elif not uses_https and has_keyword:
         risk_score += 15
     # Minor penalties for very long or symbol-heavy URLs.
     if features['url_length'] > 120:
         risk_score += 8
     if features['num_special_chars'] > 15:
         risk_score += 5
     return min(risk_score, 100)
 # -------------------------------
+# SIMPLE RISK-BASED DECISION SYSTEM
 # -------------------------------
 def get_final_prediction(phishing_pred, malware_pred, phishing_risk, malware_risk):
     """Pick the final verdict by comparing the two heuristic risk scores.

     Decision order:
       1. Both scores below the threat threshold -> "Benign".
       2. Equal scores (at or above the threshold) -> "Suspicious".
       3. Otherwise the category with the higher score wins; the gap
          between the scores decides the confidence quoted in the reason.

     Once step 1 has not returned, the larger score is necessarily at or
     above the threshold, so no further threshold checks are needed.

     Args:
         phishing_pred: Phishing model label. Accepted for interface
             compatibility but not used in the decision.
         malware_pred: Malware model label. Likewise not used.
         phishing_risk: Numeric phishing risk score.
         malware_risk: Numeric malware risk score.

     Returns:
         A ``(verdict, reason)`` tuple where ``verdict`` is one of
         ``"Benign"``, ``"Suspicious"``, ``"Phishing"`` or ``"Malicious"``.

     NOTE(review): the model predictions are ignored entirely here --
     confirm that this is intended.
     """
     THREAT_THRESHOLD = 25  # Minimum score to consider as threat
     HIGH_CONFIDENCE_THRESHOLD = 15  # Risk difference for high confidence
     if phishing_risk < THREAT_THRESHOLD and malware_risk < THREAT_THRESHOLD:
         return "Benign", f"Low risk scores (Phishing: {phishing_risk}, Malware: {malware_risk})"
     if phishing_risk == malware_risk:
         return "Suspicious", f"Equal risk scores ({phishing_risk}) - requires manual review"
     risk_difference = abs(phishing_risk - malware_risk)
     confidence = "High" if risk_difference >= HIGH_CONFIDENCE_THRESHOLD else "Medium"
     if phishing_risk > malware_risk:
         return "Phishing", f"Phishing risk higher ({phishing_risk} vs {malware_risk}) - {confidence} confidence"
     return "Malicious", f"Malware risk higher ({malware_risk} vs {phishing_risk}) - {confidence} confidence"
 def analyze_url(url):
     try:
         phishing_pred = phishing_model.predict(phishing_df)[0]
         malware_pred = malware_model.predict(malware_df)[0]
+        # Calculate refined risk scores
         phishing_risk = calculate_phishing_risk(phishing_features)
         malware_risk = calculate_malware_risk(malware_features)
+        # Get final prediction using simple risk-based system
         final_result, decision_reason = get_final_prediction(
             phishing_pred, malware_pred, phishing_risk, malware_risk
         )
                 }
             },
             "risk_analysis": {
+                "phishing_risk_level": "High" if phishing_risk >= 60 else "Medium" if phishing_risk >= 25 else "Low",
+                "malware_risk_level": "High" if malware_risk >= 60 else "Medium" if malware_risk >= 25 else "Low",
+                "confidence": "High" if abs(phishing_risk - malware_risk) >= 15 else "Medium"
             }
         }
     # Format output with enhanced information
     output = f"""
 🔍 URL Analysis Report: {result['url']}
+🎯 FINAL VERDICT: {result['final_result']}
 📌 Decision Logic: {result['decision_reason']}
 🔮 Analysis Confidence: {result['risk_analysis']['confidence']}
+━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+🔒 PHISHING ANALYSIS:
+  • Model Prediction: {result['phishing']['prediction']}
+  • Risk Score: {result['phishing']['risk_score']}/100 ({result['risk_analysis']['phishing_risk_level']} Risk)
+  • Key Indicators:
+    - IP Address: {result['phishing']['key_indicators']['has_ip']}
+    - Shortened URL: {result['phishing']['key_indicators']['is_shortened']}
+    - Suspicious TLD: {result['phishing']['key_indicators']['suspicious_tld']}
+    - Suspicious Words: {result['phishing']['key_indicators']['suspicious_words']}
+    - Path Keywords: {result['phishing']['key_indicators']['path_keywords']}
+    - No HTTPS: {result['phishing']['key_indicators']['no_https']}
+🛡️ MALWARE ANALYSIS:
+  • Model Prediction: {result['malware']['prediction']}
+  • Risk Score: {result['malware']['risk_score']}/100 ({result['risk_analysis']['malware_risk_level']} Risk)
+  • Key Indicators:
+    - IP Address: {result['malware']['key_indicators']['has_ip']}
+    - Shortened URL: {result['malware']['key_indicators']['is_shortened']}
+    - Suspicious Keywords: {result['malware']['key_indicators']['suspicious_keywords']}
+    - New Domain (<30 days): {result['malware']['key_indicators']['new_domain']}
+    - Low DNS TTL: {result['malware']['key_indicators']['low_ttl']}
+    - Invalid SSL: {result['malware']['key_indicators']['invalid_ssl']}
+📊 RISK COMPARISON:
+  • Phishing Risk: {result['phishing']['risk_score']}/100
+  • Malware Risk: {result['malware']['risk_score']}/100
+  • Risk Difference: {abs(result['phishing']['risk_score'] - result['malware']['risk_score'])} points
+  • Winner: {"Phishing" if result['phishing']['risk_score'] > result['malware']['risk_score'] else "Malware" if result['malware']['risk_score'] > result['phishing']['risk_score'] else "Equal"}
     """
     return output
 demo = gr.Interface(
     fn=interface_fn,
     inputs=gr.Textbox(label="Enter URL", placeholder="https://example.com", lines=1),
+    outputs=gr.Textbox(label="🛡️ Simple Risk-Based Threat Analysis", lines=30),
+    title="🛡️ Fixed URL Threat Analyzer - Risk Score Based",
+    description="Simple and accurate threat detection based on risk scores. Higher risk score wins!",
     examples=[
+        ["https://www.google.com"],
+        ["https://www.facebook.com"],
+        ["https://www.microsoft.com/en-us/"],
         ["https://www.paypal-login-secure.com/verify"],
         ["https://free-movie-downloads.xyz/get.exe"],
         ["http://192.168.1.100/install-update"],
         ["https://secure-apple-id-confirm.com"],
         ["https://bit.ly/malware-download"],