Spaces:

quantum-drive
/

malware-phishing-detection

Sleeping

App Files Files Community

quantum-drive committed on May 30, 2025

Commit

e5d904c

verified ·

1 Parent(s): e4a0cc7

Upload app.py

Browse files

Files changed (1) hide show

app.py +302 -0

app.py ADDED Viewed

	@@ -0,0 +1,302 @@

+import gradio as gr
+import joblib
+import pandas as pd
+import re
+import string
+import socket
+import ssl
+import whois
+import dns.resolver
+from urllib.parse import urlparse
+from datetime import datetime
+# -------------------------------
+# Load Trained Models
+# -------------------------------
# Both artifacts are read from paths relative to the current working
# directory, phishing model first. NOTE(review): joblib files are
# pickle-based, so only load artifacts from a trusted source.
phishing_model, malware_model = (
    joblib.load(artifact_path)
    for artifact_path in ("phishing_stack.pkl", "new_malware_stack.pkl")
)
+# -------------------------------
+# Enhanced Feature Extraction
+# -------------------------------
def extract_phishing_features(url):
    """Build the lexical feature dict consumed by the phishing model.

    Args:
        url: The URL string to analyse.

    Returns:
        A dict with integer counts/flags (``url_length``, ``hostname_length``,
        ``num_dots``, ``num_hyphens``, ``num_digits``, ``num_special_chars``,
        ``has_ip_address``, ``has_https``, ``has_suspicious_words``,
        ``is_shortened``) plus the string ``tld`` (empty when the hostname has
        no dot).

    NOTE(review): ``is_shortened`` and ``has_ip_address`` are now matched
    against the hostname instead of by substring/prefix. Confirm the training
    pipeline computed these two features with the same semantics.
    """
    suspicious_words = ("login", "secure", "update", "verify", "account", "banking", "paypal")
    shortener_domains = ("bit.ly", "tinyurl.com", "goo.gl", "t.co", "ow.ly", "is.gd")

    parsed = urlparse(url)
    hostname = parsed.hostname or ""
    lowered_url = url.lower()
    tld = hostname.rsplit(".", 1)[-1] if "." in hostname else ""

    # Scheme-less input such as "bit.ly/abc" has no parsed hostname; recover
    # one only for the shortener check so such links are still recognised.
    match_host = hostname
    if not match_host and "://" not in url:
        match_host = urlparse("//" + url).hostname or ""

    # Compare whole domains: a plain substring test makes "t.co" fire on
    # every host ending in "t.com" (microsoft.com, reddit.com, ...).
    is_shortened = any(
        match_host == domain or match_host.endswith("." + domain)
        for domain in shortener_domains
    )

    # fullmatch so that "1.2.3.4.evil.example" is not mistaken for an IP host.
    has_ip_address = re.fullmatch(r"\d+\.\d+\.\d+\.\d+", hostname) is not None

    return {
        "url_length": len(url),
        "hostname_length": len(hostname),
        "num_dots": url.count("."),
        "num_hyphens": url.count("-"),
        "num_digits": sum(char.isdigit() for char in url),
        # Non-word characters, with path separators excluded from the count.
        "num_special_chars": len(re.findall(r"[^\w]", url)) - url.count("/"),
        "has_ip_address": int(has_ip_address),
        "has_https": 1 if parsed.scheme == "https" else 0,
        "has_suspicious_words": int(any(word in lowered_url for word in suspicious_words)),
        "is_shortened": int(is_shortened),
        "tld": tld,
    }
def _first_whois_date(value):
    """Return the first datetime of a WHOIS date field (list or scalar), else None."""
    if isinstance(value, (list, tuple)):
        value = value[0] if value else None
    return value if isinstance(value, datetime) else None


def _whois_day_counts(url):
    """Return ``(domain_age_days, domain_expiry_days)``; -1 marks an unknown value."""
    try:
        record = whois.whois(url)
        created = _first_whois_date(record.creation_date)
        expires = _first_whois_date(record.expiration_date)
    except Exception:
        # Best effort: any lookup/parsing failure just means "unknown".
        return -1, -1
    # Use "now" in the same timezone-awareness as the record so naive and
    # aware datetimes are never subtracted from each other.
    age = (datetime.now(created.tzinfo) - created).days if created else -1
    expiry = (expires - datetime.now(expires.tzinfo)).days if expires else -1
    return age, expiry


def _dns_a_record_ttl(hostname):
    """Return the TTL of the hostname's A record set, or -1 if it cannot be resolved."""
    try:
        return dns.resolver.resolve(hostname, "A").rrset.ttl
    except Exception:
        # Best effort: resolution failures are reported as an unknown TTL.
        return -1


def _ssl_issuer_and_validity(hostname, timeout=3):
    """Return ``(issuer_organization, is_unexpired)`` for the certificate served on port 443.

    Falls back to ``("Unknown", False)`` when the TLS connection cannot be
    established or the certificate fails verification.
    """
    try:
        context = ssl.create_default_context()
        with socket.create_connection((hostname, 443), timeout=timeout) as raw_sock:
            with context.wrap_socket(raw_sock, server_hostname=hostname) as tls_sock:
                cert = tls_sock.getpeercert()
    except (OSError, ValueError):
        # ssl.SSLError and socket timeouts are OSError subclasses.
        return "Unknown", False

    # Each issuer RDN is a tuple of (key, value) pairs; take the first pair.
    issuer_fields = dict(rdn[0] for rdn in cert.get("issuer", ()))
    issuer = issuer_fields.get("organizationName") or "Unknown"

    unexpired = False
    not_after = cert.get("notAfter")
    if not_after:
        try:
            # Compare epoch seconds so the GMT expiry is not measured against
            # a naive local-time "now".
            unexpired = ssl.cert_time_to_seconds(not_after) > datetime.now().timestamp()
        except ValueError:
            unexpired = False
    return issuer, unexpired


def extract_malware_features(url):
    """Build the feature dict consumed by the malware model.

    Combines lexical URL features with best-effort WHOIS, DNS and TLS lookups.
    Every network-derived feature falls back to a sentinel (-1, "Unknown" or
    0) when the lookup fails or the URL has no hostname.

    Args:
        url: The URL string to analyse.

    Returns:
        A dict with the keys ``url_length``, ``hostname_length``, ``num_dots``,
        ``num_hyphens``, ``num_digits``, ``num_special_chars``,
        ``has_suspicious_keyword``, ``has_ip_address``, ``is_https``,
        ``is_shortened``, ``tld``, ``domain_age_days``, ``domain_expiry_days``,
        ``dns_ttl``, ``ssl_issuer`` and ``ssl_valid``.

    NOTE(review): ``is_shortened`` and ``has_ip_address`` are now matched
    against the hostname instead of by substring/prefix. Confirm the training
    pipeline computed these two features with the same semantics. The former
    ``socket.gethostbyname`` call was dropped because its result was unused.
    """
    suspicious_keywords = ("login", "secure", "verify", "update", "download", "install", "free")
    shortener_domains = ("bit.ly", "tinyurl.com", "goo.gl", "t.co", "ow.ly", "shorte.st")

    parsed = urlparse(url)
    hostname = parsed.hostname or ""
    is_https = parsed.scheme == "https"
    lowered_url = url.lower()

    # Lexical features.
    counted_punctuation = set(string.punctuation) - {"/"}
    num_specials = sum(1 for char in url if char in counted_punctuation)
    has_suspicious_keyword = any(keyword in lowered_url for keyword in suspicious_keywords)
    # fullmatch on the host so "1.2.3.4.evil.example" is not treated as an IP.
    has_ip = re.fullmatch(r"(\d{1,3}\.){3}\d{1,3}", hostname) is not None

    # Scheme-less input such as "bit.ly/abc" has no parsed hostname; recover
    # one only for the shortener check so such links are still recognised.
    match_host = hostname
    if not match_host and "://" not in url:
        match_host = urlparse("//" + url).hostname or ""
    # Whole-domain comparison: a substring test makes "t.co" match any host
    # ending in "t.com".
    is_shortened = any(
        match_host == domain or match_host.endswith("." + domain)
        for domain in shortener_domains
    )
    tld = hostname.rsplit(".", 1)[-1] if "." in hostname else ""

    # Network-derived features; skipped entirely when there is no host.
    domain_age = domain_expiry = ttl = -1
    ssl_issuer, ssl_valid = "Unknown", False
    if hostname:
        domain_age, domain_expiry = _whois_day_counts(url)
        ttl = _dns_a_record_ttl(hostname)
        if is_https:
            ssl_issuer, ssl_valid = _ssl_issuer_and_validity(hostname)

    return {
        "url_length": len(url),
        "hostname_length": len(hostname),
        "num_dots": url.count("."),
        "num_hyphens": url.count("-"),
        "num_digits": len(re.findall(r"\d", url)),
        "num_special_chars": num_specials,
        "has_suspicious_keyword": int(has_suspicious_keyword),
        "has_ip_address": int(has_ip),
        "is_https": int(is_https),
        "is_shortened": int(is_shortened),
        "tld": tld,
        "domain_age_days": domain_age,
        "domain_expiry_days": domain_expiry,
        "dns_ttl": ttl,
        "ssl_issuer": ssl_issuer,
        "ssl_valid": int(ssl_valid),
    }
+# -------------------------------
+# Prepare Model Inputs
+# -------------------------------
def prepare_phishing_input(url):
    """Return a one-row DataFrame shaped for ``phishing_model``.

    The extracted features are one-hot encoded on ``tld`` and then reindexed
    to the model's ``feature_names_in_``; columns the model expects but the
    row lacks are filled with 0.
    """
    single_row = pd.DataFrame([extract_phishing_features(url)])
    encoded = pd.get_dummies(single_row, columns=["tld"], prefix="tld")
    return encoded.reindex(columns=phishing_model.feature_names_in_, fill_value=0)
def prepare_malware_input(url):
    """Return a one-row DataFrame shaped for ``malware_model``.

    The extracted features are one-hot encoded on ``tld`` and ``ssl_issuer``
    and then reindexed to the model's ``feature_names_in_``; columns the model
    expects but the row lacks are filled with 0.
    """
    categorical_columns = ["tld", "ssl_issuer"]
    single_row = pd.DataFrame([extract_malware_features(url)])
    encoded = pd.get_dummies(
        single_row,
        columns=categorical_columns,
        prefix=["tld", "ssl_issuer"],
    )
    return encoded.reindex(columns=malware_model.feature_names_in_, fill_value=0)
+# -------------------------------
+# PREDICTION NORMALIZATION
+# -------------------------------
_THREAT_LABELS = frozenset(
    {"phishing", "malware", "1", "malicious", "threat", "bad", "true"}
)
_BENIGN_LABELS = frozenset(
    {"benign", "0", "safe", "good", "legitimate", "false"}
)


def normalize_prediction(prediction):
    """Map a raw model prediction onto ``'threat'``, ``'benign'`` or ``'unknown'``.

    Args:
        prediction: Whatever the model's ``predict`` returned for one sample;
            it is compared by its lower-cased, stripped string form.

    Returns:
        ``'threat'`` or ``'benign'`` for a recognised label, otherwise
        ``'unknown'``. Numeric forms such as ``1.0`` / ``0.0`` and booleans
        are recognised in addition to the plain text labels.
    """
    label = str(prediction).lower().strip()
    if label in _THREAT_LABELS:
        return "threat"
    if label in _BENIGN_LABELS:
        return "benign"

    # Float-typed class labels stringify as "1.0"/"0.0", which the label sets
    # above do not contain; compare them numerically instead.
    try:
        numeric = float(label)
    except ValueError:
        return "unknown"
    if numeric == 1:
        return "threat"
    if numeric == 0:
        return "benign"
    return "unknown"
+# -------------------------------
+# IMPROVED TRUTH TABLE DECISION LOGIC
+# -------------------------------
_LEGITIMATE_TLD_SUFFIXES = (".com", ".org", ".net", ".edu", ".gov", ".co.uk", ".ca", ".au")
_OVERRIDE_BLOCKING_WORDS = ("login", "signin", "verify", "update", "secure", "account", "banking")


def _is_probable_false_positive(url):
    """Return True when a phishing-only hit may be overridden to Benign.

    The hostname must END with a common TLD and the URL must not contain any
    credential-related keyword.
    """
    hostname = urlparse(url).hostname or ""
    # endswith, not substring: "paypal.com.evil.ru" contains ".com" but is
    # not a .com host and must not be treated as legitimate.
    if not hostname.endswith(_LEGITIMATE_TLD_SUFFIXES):
        return False
    lowered_url = url.lower()
    return not any(word in lowered_url for word in _OVERRIDE_BLOCKING_WORDS)


def analyze_url(url):
    """Run both models on ``url`` and combine them into a final verdict.

    Args:
        url: The URL to analyse.

    Returns:
        On success, a dict with ``url``, ``final_result`` ("Malicious",
        "Phishing", "Benign" or "Suspicious"), ``decision_reason``, the raw
        model predictions as strings and their normalized forms. If anything
        raises, a dict with the single key ``error`` holding the message.
    """
    try:
        # Get model predictions
        phishing_pred_raw = phishing_model.predict(prepare_phishing_input(url))[0]
        malware_pred_raw = malware_model.predict(prepare_malware_input(url))[0]

        # Normalize predictions
        phishing_pred = normalize_prediction(phishing_pred_raw)
        malware_pred = normalize_prediction(malware_pred_raw)

        # Truth table. Priority: Malware > Phishing > Benign, with a benign
        # bias for phishing-only hits on legitimate-looking sites.
        if malware_pred == "threat" and phishing_pred == "threat":
            final_result = "Malicious"
            reason = "Both models detected threats - High risk malware and phishing"
        elif malware_pred == "threat" and phishing_pred == "benign":
            final_result = "Malicious"
            reason = "Malware model detected malicious content"
        elif malware_pred == "benign" and phishing_pred == "benign":
            final_result = "Benign"
            reason = "Both models confirm URL is safe"
        elif malware_pred == "benign" and phishing_pred == "threat":
            if _is_probable_false_positive(url):
                final_result = "Benign"
                reason = "Legitimate website structure detected, overriding phishing model false positive"
            else:
                final_result = "Phishing"
                reason = "Phishing model detected phishing attempt"
        else:
            # At least one prediction could not be normalized.
            final_result = "Suspicious"
            reason = f"Inconclusive results - Malware: {malware_pred}, Phishing: {phishing_pred}"

        return {
            "url": url,
            "final_result": final_result,
            "decision_reason": reason,
            "phishing_model_prediction": str(phishing_pred_raw),
            "malware_model_prediction": str(malware_pred_raw),
            "normalized_phishing": phishing_pred,
            "normalized_malware": malware_pred,
        }
    except Exception as e:
        # Boundary for the UI layer: report the failure instead of raising.
        return {"error": str(e)}
+# -------------------------------
+# GRADIO INTERFACE
+# -------------------------------
def interface_fn(url):
    """Gradio callback: analyse ``url`` and return a formatted text report.

    Args:
        url: Raw text from the input box; may be empty, padded with
            whitespace, missing a scheme, or ``None``.

    Returns:
        A human-readable report string, or an error line starting with the
        cross-mark emoji when the input is empty or the analysis failed.
    """
    # Strip once up front so padding never ends up inside the analysed URL.
    url = (url or "").strip()
    if not url:
        return "❌ Please enter a valid URL"

    # Add protocol if missing; compare case-insensitively so "HTTP://host"
    # is not turned into "https://HTTP://host".
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    result = analyze_url(url)
    if "error" in result:
        return f"❌ Error analyzing URL: {result['error']}"

    # Format output for better readability
    output = f"""
🔍 Analysis Report for: {result['url']}
⚠️ Final Verdict: {result['final_result']}
📌 Decision Reason: {result['decision_reason']}
🔒 Phishing Model: {result['phishing_model_prediction']} (normalized: {result['normalized_phishing']})
🛡️ Malware Model: {result['malware_model_prediction']} (normalized: {result['normalized_malware']})
{'='*50}
"""
    # Prefix the report with a status banner matching the verdict.
    if result['final_result'] == "Benign":
        output = "✅ SAFE " + output
    elif result['final_result'] in ["Phishing", "Malicious"]:
        output = "❌ DANGEROUS " + output
    else:
        output = "⚠️ SUSPICIOUS " + output
    return output
+# -------------------------------
+# GRADIO APP
+# -------------------------------
# Single-line URL input and a multi-line report output.
_url_input = gr.Text(
    label="Enter URL to Analyze",
    placeholder="https://example.com or just example.com",
    lines=1,
)
_report_output = gr.Textbox(
    label="🛡️ Threat Analysis Report",
    lines=10,
    max_lines=15,
)

# Markdown shown under the title.
_app_description = """
    **Advanced URL Security Scanner**
    This tool uses dual AI models to detect:
    • 🎣 Phishing attacks
    • 🦠 Malware threats
    • 🔒 Overall URL safety
    Enter any URL to get a comprehensive security analysis.
    """

# Clickable sample inputs, one URL per row.
_example_urls = [
    ["https://www.google.com"],
    ["https://www.paypal.com/signin"],
    ["https://www.bbc.com/news"],
    ["bit.ly/suspicious-link"],
    ["http://malware-site.ru/download.exe"],
]

# Centre the app and cap its width.
_container_css = """
    .gradio-container {
        max-width: 800px;
        margin: auto;
    }
    """

demo = gr.Interface(
    fn=interface_fn,
    inputs=_url_input,
    outputs=_report_output,
    title="🛡️ AI-Powered URL Threat Analyzer",
    description=_app_description,
    examples=_example_urls,
    theme=gr.themes.Soft(),
    css=_container_css,
)
if __name__ == "__main__":
    # Listen on all interfaces at port 7860, request a share link, and
    # surface errors in the UI.
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo.launch(**launch_options)