import re
import socket
import ssl
import string
from datetime import datetime, timezone
from urllib.parse import urlparse

import dns.resolver
import gradio as gr
import joblib
import numpy as np
import pandas as pd
import whois


# Pre-trained models, loaded once when the module is imported.
# NOTE(security): joblib.load unpickles the file and can run arbitrary code
# while doing so -- only load .pkl artifacts that come from a trusted source.
phishing_model = joblib.load("phishing_stack.pkl")
malware_model = joblib.load("new_malware_stack.pkl")


# Substrings whose presence in a URL is treated as a phishing signal.
_PHISHING_KEYWORDS = (
    "login", "signin", "verify", "account", "update", "security",
    "banking", "paypal", "ebay", "amazon", "apple", "microsoft",
    "confirm", "validate", "password", "creditcard", "ssn", "phishing",
)

# Top-level domains the phishing heuristics treat as suspicious.
_SUSPICIOUS_TLDS = frozenset({
    "xyz", "top", "icu", "ga", "tk", "cf", "ml", "gq", "cc", "pw",
    "club", "info", "stream", "download", "work", "online",
})

# URL-shortener domains, matched only as a complete domain token.  A plain
# substring test would report "t.co" inside "microsoft.com" or "reddit.com".
_PHISHING_SHORTENER_RE = re.compile(
    r"(?<![\w-])"
    r"(?:bit\.ly|tinyurl(?:\.com)?|goo\.gl|t\.co|ow\.ly|is\.gd|shorte\.st)"
    r"(?![\w-]|\.\w)"
)


def extract_phishing_features(url):
    """Compute the lexical features used by the phishing model and risk score.

    Only the URL text is inspected; no network access is performed.

    Args:
        url: The URL string to analyse.

    Returns:
        A dict of integer features plus ``"tld"`` (a string, one-hot encoded
        later by ``prepare_phishing_input``).

    Raises:
        ValueError: If ``urlparse`` rejects the URL (e.g. a malformed IPv6
            literal).
    """
    # NOTE(review): ``is_shortened`` and ``has_ip_address`` are stricter than
    # the earlier substring/prefix tests.  If the pickled model was fitted on
    # features produced by the old tests, re-validate it -- TODO confirm.
    parsed = urlparse(url)
    hostname = parsed.hostname or ""
    tld = hostname.rsplit(".", 1)[-1] if "." in hostname else ""
    path = parsed.path.lower()
    query = parsed.query.lower()
    url_lower = url.lower()

    return {
        "url_length": len(url),
        "hostname_length": len(hostname),
        "num_dots": url.count("."),
        "num_hyphens": url.count("-"),
        "num_digits": sum(char.isdigit() for char in url),
        "num_special_chars": len(re.findall(r"[^\w\s./]", url)),
        # fullmatch: the whole host must be a dotted quad, so a name such as
        # "1.2.3.4.evil.com" is not mistaken for an IP address.
        "has_ip_address": int(
            re.fullmatch(r"\d+\.\d+\.\d+\.\d+", hostname) is not None
        ),
        "has_https": int(parsed.scheme == "https"),
        "has_suspicious_words": int(
            any(word in url_lower for word in _PHISHING_KEYWORDS)
        ),
        "is_shortened": int(
            _PHISHING_SHORTENER_RE.search(url_lower) is not None
        ),
        "suspicious_tld": int(tld in _SUSPICIOUS_TLDS),
        "path_keyword_count": sum(
            1 for word in _PHISHING_KEYWORDS if word in path
        ),
        "query_keyword_count": sum(
            1 for word in _PHISHING_KEYWORDS if word in query
        ),
        "tld": tld,
    }


# Substrings whose presence in a URL is treated as a malware signal.
_MALWARE_KEYWORDS = (
    "download", "install", "free", "crack", "keygen", "serial", "torrent",
    "nulled", "patch", "loader", "activator", "setup", "executable", "malware",
    "virus", "trojan", "spyware", "ransomware", "adware", "botnet",
)

# URL-shortener domains, matched only as a complete domain token.  A plain
# substring test would report "t.co" inside "microsoft.com" or "reddit.com".
_MALWARE_SHORTENER_RE = re.compile(
    r"(?<![\w-])"
    r"(?:bit\.ly|tinyurl\.com|goo\.gl|t\.co|ow\.ly|shorte\.st)"
    r"(?![\w-]|\.\w)"
)

# http(s) URL whose host is a bare dotted quad.  The lookahead requires the
# host to end there, so "http://1.2.3.4.evil.com" does not match.
_IP_HOST_URL_RE = re.compile(
    r"https?://(?:\d{1,3}\.){3}\d{1,3}(?=[:/?#]|$)"
)


def _first_datetime(value):
    """Return the first datetime in *value* (a datetime, a sequence, or None)."""
    if isinstance(value, (list, tuple)):
        value = next(
            (item for item in value if isinstance(item, datetime)), None
        )
    return value if isinstance(value, datetime) else None


def _whole_days(start, end):
    """Return whole days from *start* to *end*, treating naive values as UTC."""
    if start.tzinfo is None:
        start = start.replace(tzinfo=timezone.utc)
    if end.tzinfo is None:
        end = end.replace(tzinfo=timezone.utc)
    return (end - start).days


def _fetch_peer_certificate(hostname, timeout=3):
    """Return the verified TLS certificate dict of hostname:443, or None."""
    context = ssl.create_default_context()
    try:
        # create_connection + nested ``with`` closes the raw socket even when
        # the TLS handshake fails.
        with socket.create_connection((hostname, 443), timeout=timeout) as raw:
            with context.wrap_socket(raw, server_hostname=hostname) as tls:
                return tls.getpeercert()
    except (OSError, ValueError):
        # OSError covers DNS failures, timeouts and ssl.SSLError (including
        # certificate verification errors); ValueError covers bad host names.
        return None


def extract_malware_features(url):
    """Compute lexical and network features for the malware model and score.

    In addition to inspecting the URL text, this performs best-effort WHOIS,
    DNS (A record) and, for https URLs, TLS certificate lookups.  Any lookup
    that fails leaves its feature at the sentinel value (``-1``,
    ``"Unknown"`` or ``0``) instead of raising.

    Args:
        url: The URL string to analyse.

    Returns:
        A dict of numeric features plus the string features ``"tld"`` and
        ``"ssl_issuer"`` (one-hot encoded later by ``prepare_malware_input``).

    Raises:
        ValueError: If ``urlparse`` rejects the URL.
    """
    # NOTE(review): several features are now populated in cases that used to
    # collapse to their sentinel (single-datetime WHOIS records, issuers
    # without an organizationName) and the shortener / IP tests are stricter.
    # If the pickled model was fitted on the old values, re-validate it --
    # TODO confirm.
    parsed = urlparse(url)
    hostname = parsed.hostname or ""
    path = parsed.path.lower()
    url_lower = url.lower()
    is_https = parsed.scheme == "https"
    now = datetime.now(timezone.utc)

    # --- WHOIS: domain age and time until expiry -------------------------
    # python-whois may return a single datetime, a list of them, or None for
    # each date field, so normalise before doing arithmetic.
    try:
        record = whois.whois(url)
        created = _first_datetime(record.creation_date)
        expires = _first_datetime(record.expiration_date)
    except Exception:
        # Best-effort enrichment: WHOIS failures must not abort the analysis.
        created = expires = None
    domain_age = _whole_days(created, now) if created is not None else -1
    domain_expiry = _whole_days(now, expires) if expires is not None else -1

    # --- DNS: TTL of the A record ------------------------------------------
    ttl = -1
    if hostname:
        try:
            ttl = dns.resolver.resolve(hostname, "A").rrset.ttl
        except Exception:
            # Best-effort enrichment: resolver errors leave the sentinel.
            ttl = -1

    # --- TLS: certificate issuer and expiry --------------------------------
    ssl_issuer = "Unknown"
    ssl_valid = False
    if is_https and hostname:
        cert = _fetch_peer_certificate(hostname)
        if cert:
            issuer = {
                key: value
                for rdn in cert.get("issuer", ())
                for key, value in rdn
            }
            # .get(): a certificate without an organizationName must not
            # prevent the validity check below from running.
            ssl_issuer = issuer.get("organizationName") or "Unknown"
            try:
                # notAfter is expressed in GMT; compare epoch seconds rather
                # than a naive local-time datetime.
                not_after = ssl.cert_time_to_seconds(cert["notAfter"])
                ssl_valid = not_after > now.timestamp()
            except (KeyError, ValueError):
                ssl_valid = False

    return {
        "url_length": len(url),
        "hostname_length": len(hostname),
        "num_dots": url.count("."),
        "num_hyphens": url.count("-"),
        "num_digits": len(re.findall(r"\d", url)),
        "num_special_chars": len(re.findall(r"[^\w\s./]", url)),
        "has_suspicious_keyword": int(
            any(word in url_lower for word in _MALWARE_KEYWORDS)
        ),
        "path_keyword_count": sum(
            1 for word in _MALWARE_KEYWORDS if word in path
        ),
        "has_ip_address": int(_IP_HOST_URL_RE.match(url) is not None),
        "is_https": int(is_https),
        "is_shortened": int(
            _MALWARE_SHORTENER_RE.search(url_lower) is not None
        ),
        "tld": hostname.rsplit(".", 1)[-1] if "." in hostname else "",
        "domain_age_days": domain_age,
        "domain_expiry_days": domain_expiry,
        "dns_ttl": ttl,
        "ssl_issuer": ssl_issuer,
        "ssl_valid": int(ssl_valid),
    }


def prepare_phishing_input(url):
    """Build the single-row feature frame passed to ``phishing_model``.

    Args:
        url: The URL string to analyse.

    Returns:
        A one-row ``pandas.DataFrame`` whose columns are exactly
        ``phishing_model.feature_names_in_``.
    """
    features = extract_phishing_features(url)
    frame = pd.get_dummies(
        pd.DataFrame([features]), columns=["tld"], prefix="tld"
    )
    # Align with the model's column layout: dummy columns the model does not
    # list are dropped, and listed columns absent from this row become 0.
    return frame.reindex(
        columns=phishing_model.feature_names_in_, fill_value=0
    )


def prepare_malware_input(url):
    """Build the single-row feature frame passed to ``malware_model``.

    Args:
        url: The URL string to analyse.

    Returns:
        A one-row ``pandas.DataFrame`` whose columns are exactly
        ``malware_model.feature_names_in_``.
    """
    features = extract_malware_features(url)
    categorical = ["tld", "ssl_issuer"]
    frame = pd.get_dummies(
        pd.DataFrame([features]), columns=categorical, prefix=categorical
    )
    # Align with the model's column layout: dummy columns the model does not
    # list are dropped, and listed columns absent from this row become 0.
    return frame.reindex(
        columns=malware_model.feature_names_in_, fill_value=0
    )


def calculate_phishing_risk(features):
    """Return a rule-based phishing risk score in the range 0-100.

    Args:
        features: Feature dict as produced by ``extract_phishing_features``.
            The keys read are ``has_ip_address``, ``is_shortened``,
            ``has_suspicious_words``, ``suspicious_tld``, ``num_hyphens``,
            ``path_keyword_count``, ``query_keyword_count``, ``url_length``
            and ``num_special_chars``.

    Returns:
        The sum of the points of every rule that fires, capped at 100.

    Raises:
        KeyError: If a required feature is missing.
    """
    risk_score = 0

    # A raw IP address as host is the single strongest indicator.
    if features["has_ip_address"]:
        risk_score += 40

    # Shortened URL, weighted higher when it also carries phishing keywords.
    if features["is_shortened"] and features["has_suspicious_words"]:
        risk_score += 35
    elif features["is_shortened"]:
        risk_score += 15

    # Phishing keywords, weighted higher on a suspicious TLD.
    if features["has_suspicious_words"] and features["suspicious_tld"]:
        risk_score += 30
    elif features["has_suspicious_words"]:
        risk_score += 10

    # Suspicious TLD, weighted higher for heavily hyphenated URLs.
    if features["suspicious_tld"] and features["num_hyphens"] > 2:
        risk_score += 25
    elif features["suspicious_tld"]:
        risk_score += 10

    # Keywords in the path, weighted higher when the query has some too.
    if (
        features["path_keyword_count"] > 1
        and features["query_keyword_count"] > 0
    ):
        risk_score += 20
    elif features["path_keyword_count"] > 0:
        risk_score += 8

    # Minor structural indicators.
    if features["url_length"] > 100:
        risk_score += 8
    if features["num_special_chars"] > 10:
        risk_score += 5
    if features["num_hyphens"] > 3:
        risk_score += 5

    return min(risk_score, 100)


def calculate_malware_risk(features):
    """Return a rule-based malware risk score in the range 0-100.

    Args:
        features: Feature dict as produced by ``extract_malware_features``.
            The keys read are ``has_ip_address``, ``has_suspicious_keyword``,
            ``is_shortened``, ``path_keyword_count``, ``domain_age_days``,
            ``dns_ttl``, ``ssl_valid``, ``is_https``, ``url_length`` and
            ``num_special_chars``.

    Returns:
        The sum of the points of every rule that fires, capped at 100.

    Raises:
        KeyError: If a required feature is missing.
    """
    risk_score = 0
    has_keyword = features["has_suspicious_keyword"]
    domain_age = features["domain_age_days"]

    # A raw IP address as host is the single strongest indicator.
    if features["has_ip_address"]:
        risk_score += 40

    # Malware keywords, weighted higher behind a URL shortener.
    if has_keyword and features["is_shortened"]:
        risk_score += 35
    elif has_keyword:
        risk_score += 15

    # Keywords in the path.
    if features["path_keyword_count"] > 2:
        risk_score += 30
    elif features["path_keyword_count"] > 0:
        risk_score += 12

    # Domain age; a negative value means "unknown" and scores nothing.
    if 0 <= domain_age < 7:
        risk_score += 30
    elif 7 <= domain_age < 30:
        risk_score += 15
    elif domain_age > 365 * 20:
        # NOTE(review): domains older than ~20 years also add risk; the
        # rationale is not visible in this file -- confirm it is intended.
        risk_score += 8

    # A short DNS TTL only counts when another indicator is present.
    if 0 < features["dns_ttl"] < 300 and (
        has_keyword or features["has_ip_address"] or features["is_shortened"]
    ):
        risk_score += 20

    # https without a valid certificate, or plain http carrying keywords.
    if not features["ssl_valid"] and features["is_https"]:
        risk_score += 20
    elif not features["is_https"] and has_keyword:
        risk_score += 15

    # Minor structural indicators.
    if features["url_length"] > 120:
        risk_score += 8
    if features["num_special_chars"] > 15:
        risk_score += 5

    return min(risk_score, 100)


# Host names that are always reported as benign (exact match only).
_TRUSTED_DOMAINS = frozenset({
    "google.com", "www.google.com", "facebook.com", "www.facebook.com",
    "microsoft.com", "www.microsoft.com", "apple.com", "www.apple.com",
    "amazon.com", "www.amazon.com", "youtube.com", "www.youtube.com",
    "twitter.com", "www.twitter.com", "linkedin.com", "www.linkedin.com",
    "github.com", "www.github.com", "stackoverflow.com",
    "www.stackoverflow.com",
})


def get_final_prediction(phishing_pred, malware_pred, phishing_risk,
                         malware_risk, url=None):
    """Combine both model predictions and risk scores into one verdict.

    Decision order:
      1. A whitelisted host (when ``url`` is given) is always benign.
      2. Model predictions take priority; risk scores break the tie when
         both models flag the URL.
      3. When neither model flags the URL, the risk scores alone can still
         raise the verdict to Phishing / Malicious / Suspicious.

    Args:
        phishing_pred: Phishing model label; ``"Phishing"`` means flagged.
        malware_pred: Malware model label; ``"malicious"`` means flagged.
        phishing_risk: Rule-based phishing score (0-100).
        malware_risk: Rule-based malware score (0-100).
        url: Optional URL being judged.  Earlier versions looked for a local
            named ``url`` that never existed, so the whitelist could not
            trigger; without this argument the whitelist is skipped.

    Returns:
        A ``(verdict, reason)`` tuple of strings, where verdict is one of
        ``"Benign"``, ``"Suspicious"``, ``"Phishing"`` or ``"Malicious"``.
    """
    if url:
        try:
            # hostname (not netloc) so ports and userinfo cannot defeat or
            # spoof the exact-match comparison.
            domain = (urlparse(url).hostname or "").lower()
        except ValueError:
            domain = ""
        if domain in _TRUSTED_DOMAINS:
            return "Benign", f"Whitelisted trusted domain: {domain}"

    RISK_BOOST_THRESHOLD = 15
    phishing_flagged = phishing_pred == "Phishing"
    malware_flagged = malware_pred == "malicious"

    if phishing_flagged and malware_flagged:
        # Both models agree there is a threat; the higher risk names it.
        if phishing_risk > malware_risk:
            return "Phishing", f"Both models detected threat - phishing characteristics stronger (Risk: {phishing_risk} vs {malware_risk})"
        return "Malicious", f"Both models detected threat - malware characteristics stronger (Risk: {malware_risk} vs {phishing_risk})"

    if phishing_flagged:
        # The verdict is the same either way; only the explanation differs.
        if phishing_risk >= RISK_BOOST_THRESHOLD or phishing_risk > malware_risk:
            return "Phishing", f"Phishing model detected threat with supporting risk indicators (Risk: {phishing_risk})"
        return "Phishing", f"Phishing model detected threat (Risk score: {phishing_risk})"

    if malware_flagged:
        if malware_risk >= RISK_BOOST_THRESHOLD or malware_risk > phishing_risk:
            return "Malicious", f"Malware model detected threat with supporting risk indicators (Risk: {malware_risk})"
        return "Malicious", f"Malware model detected threat (Risk score: {malware_risk})"

    # Neither model flagged the URL: fall back to the rule-based scores.
    HIGH_RISK_THRESHOLD = 40
    MEDIUM_RISK_THRESHOLD = 25

    if phishing_risk >= HIGH_RISK_THRESHOLD and malware_risk >= HIGH_RISK_THRESHOLD:
        if phishing_risk > malware_risk:
            return "Phishing", f"Models missed but high phishing risk detected ({phishing_risk})"
        return "Malicious", f"Models missed but high malware risk detected ({malware_risk})"
    if phishing_risk >= HIGH_RISK_THRESHOLD:
        return "Phishing", f"Models reported benign but high phishing risk indicators ({phishing_risk})"
    if malware_risk >= HIGH_RISK_THRESHOLD:
        return "Malicious", f"Models reported benign but high malware risk indicators ({malware_risk})"
    if phishing_risk >= MEDIUM_RISK_THRESHOLD or malware_risk >= MEDIUM_RISK_THRESHOLD:
        return "Suspicious", f"Models reported benign but moderate risk present (P:{phishing_risk}, M:{malware_risk})"
    return "Benign", f"Models and risk analysis confirm safe (P:{phishing_risk}, M:{malware_risk})"


def _encode_for_model(features, categorical_columns, model):
    """One-hot encode *features* and align the columns with *model*."""
    frame = pd.get_dummies(
        pd.DataFrame([features]),
        columns=categorical_columns,
        prefix=categorical_columns,
    )
    return frame.reindex(columns=model.feature_names_in_, fill_value=0)


def _risk_level(score):
    """Map a 0-100 risk score to "High" / "Medium" / "Low"."""
    if score >= 60:
        return "High"
    if score >= 25:
        return "Medium"
    return "Low"


def analyze_url(url):
    """Run both models and the rule-based scoring on *url*.

    Args:
        url: The URL string to analyse.

    Returns:
        A report dict with the keys ``url``, ``final_result``,
        ``decision_reason``, ``phishing``, ``malware`` and ``risk_analysis``.
        If anything raises, ``{"error": <message>}`` is returned instead.
    """
    try:
        # Extract each feature set once and reuse it for both the model input
        # and the risk score; the malware extractor performs WHOIS, DNS and
        # TLS lookups, so running it twice doubles the request latency.
        phishing_features = extract_phishing_features(url)
        malware_features = extract_malware_features(url)

        phishing_df = _encode_for_model(
            phishing_features, ["tld"], phishing_model
        )
        malware_df = _encode_for_model(
            malware_features, ["tld", "ssl_issuer"], malware_model
        )

        phishing_pred = phishing_model.predict(phishing_df)[0]
        malware_pred = malware_model.predict(malware_df)[0]

        phishing_risk = calculate_phishing_risk(phishing_features)
        malware_risk = calculate_malware_risk(malware_features)

        final_result, decision_reason = get_final_prediction(
            phishing_pred, malware_pred, phishing_risk, malware_risk, url=url
        )

        return {
            "url": url,
            "final_result": final_result,
            "decision_reason": decision_reason,
            "phishing": {
                "prediction": phishing_pred,
                "risk_score": phishing_risk,
                "key_indicators": {
                    "has_ip": bool(phishing_features["has_ip_address"]),
                    "is_shortened": bool(phishing_features["is_shortened"]),
                    "suspicious_tld": bool(phishing_features["suspicious_tld"]),
                    "suspicious_words": bool(phishing_features["has_suspicious_words"]),
                    "path_keywords": phishing_features["path_keyword_count"],
                    "no_https": not bool(phishing_features["has_https"]),
                },
            },
            "malware": {
                "prediction": malware_pred,
                "risk_score": malware_risk,
                "key_indicators": {
                    "has_ip": bool(malware_features["has_ip_address"]),
                    "is_shortened": bool(malware_features["is_shortened"]),
                    "suspicious_keywords": bool(malware_features["has_suspicious_keyword"]),
                    "new_domain": 0 <= malware_features["domain_age_days"] < 30,
                    "low_ttl": 0 < malware_features["dns_ttl"] < 300,
                    "invalid_ssl": (
                        not bool(malware_features["ssl_valid"])
                        and bool(malware_features["is_https"])
                    ),
                },
            },
            "risk_analysis": {
                "phishing_risk_level": _risk_level(phishing_risk),
                "malware_risk_level": _risk_level(malware_risk),
                "confidence": (
                    "High" if abs(phishing_risk - malware_risk) >= 15
                    else "Medium"
                ),
            },
        }
    except Exception as e:
        # Top-level boundary for the UI: report the failure, do not crash.
        return {"error": str(e)}


def interface_fn(url):
    """Gradio callback: analyse *url* and format the result for display.

    Args:
        url: Text entered in the URL textbox.

    Returns:
        A multi-line string with both model predictions and the final
        verdict, or a single ``"❌ Error: ..."`` line when the input is
        empty or the analysis fails.
    """
    # Reject empty input up front instead of running the network lookups.
    if not url or not url.strip():
        return "❌ Error: please enter a URL"

    result = analyze_url(url.strip())
    if "error" in result:
        return f"❌ Error: {result['error']}"

    # Return the full comparison.  It used to be built and then discarded in
    # favour of the bare verdict, although the output box is labelled as a
    # model comparison.
    return "\n".join([
        f"📋 Phishing Model Prediction: {result['phishing']['prediction']}",
        f"📋 Malware Model Prediction: {result['malware']['prediction']}",
        "",
        f"🎯 FINAL VERDICT: {result['final_result']}",
    ])


# Gradio UI: one URL textbox in, one text report out.
demo = gr.Interface(
    fn=interface_fn,
    inputs=gr.Textbox(label="Enter URL", placeholder="https://example.com", lines=1),
    outputs=gr.Textbox(label="🛡️ Simple Model Comparison & Final Verdict", lines=10),
    title="🛡️ URL Threat Analyzer - Model Comparison",
    description="Predicts using both phishing and malware models and shows final verdict",
    examples=[
        ["https://www.google.com"],
        ["https://www.paypal-login-secure.com/verify"],
        ["https://free-movie-downloads.xyz/get.exe"],
        ["http://192.168.1.100/install-update"],
        ["https://banking-update.tk/signin"]
    ],
    theme="soft"
)

if __name__ == "__main__":
    # NOTE(review): share=True together with server_name="0.0.0.0" makes the
    # app reachable beyond localhost, and show_error=True surfaces exception
    # text to visitors -- confirm this is intended for the deployment.
    demo.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        show_api=True,
        quiet=False
    )