"""
URL Phishing Analyzer
Rule-based phishing detection with optional PhishTank API integration.
Computes a risk score (0-100) and confidence (0-1) with detailed reasons.
"""
import re
import asyncio
from datetime import datetime
from typing import Dict, List, Optional, Any
import logging
logger = logging.getLogger(__name__)
# Top-level domains treated as a phishing signal. Entries keep their leading
# dot because analyze_url compares them against '.' + the last hostname label.
SUSPICIOUS_TLDS = {
    '.bid', '.cf', '.click', '.club', '.date',
    '.download', '.ga', '.gq', '.info', '.link',
    '.loan', '.ml', '.mov', '.party', '.php',
    '.racing', '.stream', '.tk', '.top', '.trade',
    '.webcam', '.win', '.work', '.xyz', '.zip',
}
# Hostnames of link-shortening services; a URL whose hostname is exactly one
# of these is flagged because the real destination is hidden.
URL_SHORTENERS = {
    'bit.ly', 'buff.ly', 'clck.ru', 'cutt.ly',
    'goo.gl', 'is.gd', 'j.mp', 'ow.ly',
    'qr.ae', 'rb.gy', 's.id', 'shorturl.at',
    't.co', 'tiny.cc', 'tinyurl.com', 'v.gd',
}
# Substrings that raise suspicion when they appear anywhere in a URL.
# This is a list (not a set) so matches are reported in this fixed order.
SUSPICIOUS_KEYWORDS = [
    # Account / credential / urgency vocabulary
    'login',
    'verify',
    'account',
    'secure',
    'update',
    'banking',
    'signin',
    'confirm',
    'password',
    'credential',
    'authenticate',
    'wallet',
    'suspend',
    'restrict',
    'unlock',
    'alert',
    'notification',
    # Brand names
    'paypal',
    'appleid',
    'microsoft',
    'amazon',
    'netflix',
    'facebook',
    'instagram',
    'wellsfargo',
    'chase',
    'citi',
]
# Allowlist of exact hostnames that short-circuit the analysis as "trusted".
# Matching is by exact hostname, so each subdomain must be listed explicitly.
TRUSTED_DOMAINS = {
    'google.com', 'www.google.com', 'mail.google.com',
    'drive.google.com', 'docs.google.com',
    'youtube.com', 'www.youtube.com',
    'facebook.com', 'www.facebook.com',
    'instagram.com', 'www.instagram.com',
    'whatsapp.com', 'web.whatsapp.com',
    'twitter.com', 'x.com',
    'github.com', 'www.github.com',
    'stackoverflow.com',
    'microsoft.com', 'www.microsoft.com',
    'outlook.com', 'outlook.live.com',
    'apple.com', 'www.apple.com',
    'amazon.com', 'www.amazon.com',
    'netflix.com', 'www.netflix.com',
    'wikipedia.org', 'en.wikipedia.org',
    'linkedin.com', 'www.linkedin.com',
    'reddit.com', 'www.reddit.com',
    'localhost',
}
# Homoglyph characters (look-alikes used to spoof domains).
# Matches any character from the Cyrillic Unicode blocks:
#   U+0400-U+04FF  Cyrillic (includes the basic Russian alphabet)
#   U+0500-U+052F  Cyrillic Supplement
#   U+2DE0-U+2DFF  Cyrillic Extended-A
#   U+A640-U+A69F  Cyrillic Extended-B
# The class is spelled only with \u escapes so it cannot be corrupted by a
# file-encoding mix-up; a mis-decoded literal range here previously matched
# ordinary Latin-1 accented letters as well.
HOMOGLYPH_PATTERN = re.compile(
    r'[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]'
)
# Patterns used by analyze_url, compiled once at import time.
_IPV4_HOST_RE = re.compile(r'^(\d{1,3}\.){3}\d{1,3}$')
_DOUBLE_EXTENSION_RE = re.compile(r'\.\w{2,4}\.\w{2,4}$')
_PERCENT_ENCODED_RE = re.compile(r'%[0-9a-fA-F]{2}')


def _build_result(
    url: Any,
    *,
    safe: bool,
    risk_score: int,
    confidence: float,
    reasons: List[str],
    category: str,
) -> Dict[str, Any]:
    """Assemble the result dictionary shared by every exit path of analyze_url."""
    from datetime import timezone

    # Same naive-UTC ISO string that datetime.utcnow().isoformat() produced,
    # without relying on the deprecated utcnow().
    timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    return {
        "url": url,
        "safe": safe,
        "riskScore": risk_score,
        "confidence": confidence,
        "reasons": reasons,
        "category": category,
        "timestamp": timestamp,
    }


def analyze_url(url_string: str) -> Dict[str, Any]:
    """
    Analyze a URL for phishing indicators.

    Each rule that fires adds a fixed number of points to the risk score and
    appends a human-readable reason. The score is capped at 100; confidence
    grows with the number of reasons (0.3 + 0.12 per reason, capped at 1.0).

    Input without an explicit http:// or https:// prefix is parsed as if it
    were an http:// URL, so it is also subject to the plain-HTTP penalty.
    Surrounding whitespace is ignored for the analysis.

    Args:
        url_string: The URL to analyze

    Returns:
        Dictionary with url, safe, riskScore, confidence, reasons, category,
        timestamp. "url" is always the input exactly as passed in. category is
        one of "invalid", "malformed", "trusted", "safe", "low_risk",
        "medium_risk", "high_risk"; safe is True when the score is below 40.
    """
    from urllib.parse import urlparse

    # Basic validation: non-strings, empty strings and whitespace-only strings
    if not isinstance(url_string, str) or not url_string.strip():
        return _build_result(
            url_string,
            safe=False,
            risk_score=100,
            confidence=1.0,
            reasons=["Invalid or empty URL"],
            category="invalid",
        )

    url = url_string.strip()
    full_url = url.lower()

    # Handle URLs without protocol. The test is case-insensitive and requires
    # the full "scheme://" prefix so hosts such as "httpbin.org" still get one.
    if full_url.startswith(('http://', 'https://')):
        url_with_protocol = url
    else:
        url_with_protocol = f'http://{url}'

    try:
        parsed = urlparse(url_with_protocol)
        hostname = (parsed.hostname or "").lower()
    except ValueError:
        return _build_result(
            url_string,
            safe=False,
            risk_score=80,
            confidence=0.9,
            reasons=["Malformed URL that cannot be parsed"],
            category="malformed",
        )

    # .port raises ValueError for a non-numeric or out-of-range port. That
    # happens for every data: URI (parsed as host "data", port "text/..."),
    # so treat it as "no port" instead of crashing before the later checks.
    try:
        port = parsed.port
    except ValueError:
        port = None

    path = parsed.path.lower()

    # Whitelist check
    if hostname in TRUSTED_DOMAINS:
        return _build_result(
            url_string,
            safe=True,
            risk_score=0,
            confidence=0.95,
            reasons=[],
            category="trusted",
        )

    reasons: List[str] = []
    risk_score = 0
    labels = hostname.split('.') if hostname else []

    # Check 1: IP address instead of domain
    if _IPV4_HOST_RE.match(hostname):
        risk_score += 30
        reasons.append("URL uses an IP address instead of a domain name — common in phishing")

    # Check 2: Suspicious TLD
    tld = '.' + (labels[-1] if labels else "")
    if tld in SUSPICIOUS_TLDS:
        risk_score += 20
        reasons.append(f'Uses suspicious top-level domain "{tld}" — frequently abused for phishing')

    # Check 3: Excessive subdomains (labels beyond "name.tld")
    subdomain_count = len(labels) - 2 if labels else 0
    if subdomain_count > 2:
        risk_score += 15
        reasons.append(f"Excessive subdomains ({subdomain_count + 2} levels) — used to disguise real domain")

    # Check 4: URL shortener
    if hostname in URL_SHORTENERS:
        risk_score += 25
        reasons.append("URL shortener detected — hides the actual destination, often used in phishing")

    # Check 5: Homoglyph / IDN characters
    if HOMOGLYPH_PATTERN.search(url):
        risk_score += 35
        reasons.append("Contains homoglyph/Cyrillic characters — used to impersonate legitimate domains")

    # Check 6: Very long URL
    if len(url) > 100:
        risk_score += 10
        reasons.append(f"Unusually long URL ({len(url)} characters) — may be hiding malicious content")

    # Check 7: Suspicious keywords in URL (each worth 8 points, capped at 25)
    found_keywords = [kw for kw in SUSPICIOUS_KEYWORDS if kw in full_url]
    if found_keywords:
        risk_score += min(len(found_keywords) * 8, 25)
        reasons.append(f"Contains suspicious keywords: {', '.join(found_keywords)} — common in phishing URLs")

    # Check 8: @ symbol in URL
    if '@' in url:
        risk_score += 25
        reasons.append('Contains "@" symbol — can redirect to a different domain than displayed')

    # Check 9: Data URI
    if full_url.startswith('data:'):
        risk_score += 40
        reasons.append("Data URI detected — can embed malicious content without a server")

    # Check 10: HTTPS check. Only a real localhost name is exempt; a substring
    # test would also exempt hosts like "localhost.evil.com".
    is_localhost = hostname == 'localhost' or hostname.endswith('.localhost')
    if parsed.scheme == 'http' and not is_localhost:
        risk_score += 40
        reasons.append("Uses HTTP instead of HTTPS — connection is not encrypted and insecure")

    # Check 11: Port number in URL
    if port and port not in (80, 443):
        risk_score += 10
        reasons.append(f"Non-standard port (:{port}) — unusual for legitimate websites")

    # Check 12: Double extension in path
    if _DOUBLE_EXTENSION_RE.search(path):
        risk_score += 20
        reasons.append("Double file extension detected in path — common trick to disguise malware")

    # Check 13: Encoded characters abuse
    encoded_count = len(_PERCENT_ENCODED_RE.findall(url))
    if encoded_count > 5:
        risk_score += 15
        reasons.append(f"Heavy URL encoding ({encoded_count} encoded chars) — may be obfuscating content")

    # Check 14: Hyphen abuse in domain
    hyphens = hostname.count('-')
    if hyphens > 3:
        risk_score += 15
        reasons.append(f"Excessive hyphens in domain ({hyphens}) — common in phishing domains")

    # Cap score at 100
    risk_score = min(risk_score, 100)

    # Compute confidence based on number of signals
    confidence = min(0.3 + len(reasons) * 0.12, 1.0)

    # Determine category
    if risk_score >= 70:
        category = "high_risk"
    elif risk_score >= 40:
        category = "medium_risk"
    elif risk_score >= 20:
        category = "low_risk"
    else:
        category = "safe"

    return _build_result(
        url_string,
        safe=risk_score < 40,
        risk_score=risk_score,
        confidence=round(confidence, 2),
        reasons=reasons,
        category=category,
    )
async def check_phishtank(url_string: str, api_key: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """
    Check URL against PhishTank database.

    Best-effort lookup: any failure (aiohttp not installed, network error,
    timeout, non-200 status, unexpected response body) is logged as a warning
    where applicable and reported as None rather than raised.

    Args:
        url_string: URL to check
        api_key: PhishTank API key (optional); sent as "app_key" when given

    Returns:
        Dict with "inDatabase", "isPhish" and "phishDetailUrl", or None when
        the URL is empty/not a string or the lookup did not succeed.
    """
    # Nothing to look up; avoid a pointless network round trip.
    if not url_string or not isinstance(url_string, str):
        return None

    try:
        # Imported lazily so the rule-based analyzer works without aiohttp.
        import aiohttp

        params = {
            'url': url_string,
            'format': 'json',
        }
        if api_key:
            params['app_key'] = api_key

        # NOTE(review): this is a *staging* host reached over plain HTTP, so
        # the checked URL and the API key travel unencrypted. Confirm against
        # the PhishTank API documentation whether the production HTTPS
        # endpoint should be used instead.
        endpoint = 'http://checkurl.staging.phishtank.com/checkurl/'

        async with aiohttp.ClientSession() as session:
            async with session.post(
                endpoint,
                data=params,
                headers={'User-Agent': 'phishtank/wingineers'},
                timeout=aiohttp.ClientTimeout(total=5),
            ) as response:
                if response.status != 200:
                    return None
                data = await response.json()

        # A missing "results" key yields all-negative fields; a malformed
        # payload raises below and is handled like any other failure.
        results = data.get('results', {})
        return {
            "inDatabase": results.get('in_database') in [True, 'true'],
            "isPhish": results.get('valid') in [True, 'y'],
            "phishDetailUrl": results.get('phish_detail_page'),
        }
    except Exception as e:
        # Deliberately broad: this lookup is optional and must never break
        # the caller's analysis.
        logger.warning("PhishTank lookup failed: %s", e)
        return None
def _risk_category(risk_score: int) -> str:
    """Map a 0-100 risk score to its category label (thresholds 70 / 40 / 20)."""
    if risk_score >= 70:
        return "high_risk"
    if risk_score >= 40:
        return "medium_risk"
    if risk_score >= 20:
        return "low_risk"
    return "safe"


async def full_analysis(url_string: str, phish_tank_api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Full analysis: rule-based + PhishTank.

    Runs analyze_url, then augments its result with a PhishTank lookup. The
    lookup is attempted whether or not an API key is supplied; the key is
    simply forwarded when present.

    Args:
        url_string: URL to analyze
        phish_tank_api_key: PhishTank API key (optional)

    Returns:
        The analyze_url result dictionary. When the PhishTank lookup returned
        data it gains a "phishTank" entry, and safe / riskScore / confidence /
        reasons / category are adjusted to reflect the PhishTank verdict.
    """
    result = analyze_url(url_string)

    # Input rejected as empty / non-string cannot be looked up.
    if result["category"] == "invalid":
        return result

    phish_tank_result = await check_phishtank(url_string, phish_tank_api_key)
    if not phish_tank_result:
        return result

    result["phishTank"] = phish_tank_result

    if phish_tank_result["isPhish"]:
        result["safe"] = False
        result["riskScore"] = max(result["riskScore"], 90)
        result["confidence"] = max(result["confidence"], 0.95)
        result["reasons"].append("⚠️ Confirmed phishing URL in PhishTank database")
        result["category"] = "high_risk"
    elif phish_tank_result["inDatabase"] and result["riskScore"] > 20:
        # In database but not confirmed as phish — lower risk slightly, and
        # re-derive the dependent fields so the result stays self-consistent.
        result["riskScore"] = max(result["riskScore"] - 10, 0)
        result["safe"] = result["riskScore"] < 40
        result["category"] = _risk_category(result["riskScore"])

    return result
# End of module.