| """ |
| URL Phishing Analyzer |
| Rule-based phishing detection with optional PhishTank API integration. |
| Computes a risk score (0-100) and confidence (0-1) with detailed reasons. |
| """ |
|
|
| import re |
| import asyncio |
| from datetime import datetime |
| from typing import Dict, List, Optional, Any |
| import logging |
|
|
| logger = logging.getLogger(__name__) |
|
|
| |
# Hostname suffixes (with leading dot) that add to the risk score when the
# last dot-separated label of the hostname matches one of them.
SUSPICIOUS_TLDS = {
    '.bid', '.cf', '.click', '.club',
    '.date', '.download',
    '.ga', '.gq',
    '.info',
    '.link', '.loan',
    '.ml', '.mov',
    '.party', '.php',
    '.racing',
    '.stream',
    '.tk', '.top', '.trade',
    '.webcam', '.win', '.work',
    '.xyz',
    '.zip',
}
|
|
| |
# Hostnames of link-shortening services; compared against the full
# lower-cased hostname of the analyzed URL.
URL_SHORTENERS = {
    'bit.ly', 'buff.ly', 'cutt.ly', 'ow.ly',
    'goo.gl', 'is.gd', 'v.gd', 's.id',
    'j.mp', 't.co', 'tiny.cc', 'tinyurl.com',
    'rb.gy', 'shorturl.at', 'clck.ru', 'qr.ae',
}
|
|
| |
# Substrings searched for in the lower-cased URL. Order is significant: it is
# the order in which matches are listed in the reported reason.
SUSPICIOUS_KEYWORDS = [
    # credential / account actions
    'login', 'verify', 'account', 'secure', 'update', 'banking',
    'signin', 'confirm', 'password', 'credential', 'authenticate',
    # urgency / account-state wording
    'wallet', 'suspend', 'restrict', 'unlock', 'alert', 'notification',
    # brand names
    'paypal', 'appleid', 'microsoft', 'amazon', 'netflix', 'facebook',
    'instagram', 'wellsfargo', 'chase', 'citi',
]
|
|
| |
# Exact hostnames that short-circuit the analysis with a "trusted" verdict.
# Matching is by full hostname, so subdomains must be listed explicitly.
TRUSTED_DOMAINS = {
    # Google
    'google.com', 'www.google.com', 'mail.google.com',
    'drive.google.com', 'docs.google.com',
    'youtube.com', 'www.youtube.com',
    # Meta
    'facebook.com', 'www.facebook.com',
    'instagram.com', 'www.instagram.com',
    'whatsapp.com', 'web.whatsapp.com',
    # Microsoft
    'microsoft.com', 'www.microsoft.com',
    'outlook.com', 'outlook.live.com',
    'linkedin.com', 'www.linkedin.com',
    'github.com', 'www.github.com',
    # Other well-known sites
    'twitter.com', 'x.com',
    'apple.com', 'www.apple.com',
    'amazon.com', 'www.amazon.com',
    'netflix.com', 'www.netflix.com',
    'reddit.com', 'www.reddit.com',
    'wikipedia.org', 'en.wikipedia.org',
    'stackoverflow.com',
    # Local development
    'localhost',
}
|
|
| |
# Matches a single character from the listed Cyrillic code-point ranges; used
# to spot look-alike letters substituted into otherwise Latin URLs.
HOMOGLYPH_PATTERN = re.compile(
    r'[а-яА-ЯёЁ\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]'
)
|
|
|
|
def _utc_timestamp() -> str:
    """Return the current UTC time as a naive ISO-8601 string.

    Yields the same format as ``datetime.utcnow().isoformat()`` (no UTC
    offset suffix) without calling the deprecated ``utcnow``.
    """
    from datetime import timezone

    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()


def _build_result(
    url: Any,
    safe: bool,
    risk_score: int,
    confidence: float,
    reasons: List[str],
    category: str,
) -> Dict[str, Any]:
    """Assemble the verdict dictionary returned by ``analyze_url``."""
    return {
        "url": url,
        "safe": safe,
        "riskScore": risk_score,
        "confidence": confidence,
        "reasons": reasons,
        "category": category,
        "timestamp": _utc_timestamp(),
    }


def analyze_url(url_string: str) -> Dict[str, Any]:
    """
    Analyze a URL for phishing indicators using rule-based heuristics.

    Each rule that fires adds a fixed number of points to the risk score and
    appends a human-readable reason. The score is capped at 100 and the
    confidence grows with the number of rules that fired.

    Args:
        url_string: The URL to analyze. A missing scheme is tolerated; the
            URL is then parsed as if it were prefixed with ``http://``.

    Returns:
        Dictionary with url, safe, riskScore, confidence, reasons, category
        and timestamp. ``category`` is one of "invalid", "malformed",
        "trusted", "safe", "low_risk", "medium_risk" or "high_risk".
        ``safe`` is True for trusted hosts and for scores below 40.
    """
    from urllib.parse import urlparse

    # Reject empty or non-string input outright.
    if not url_string or not isinstance(url_string, str):
        return _build_result(
            url_string, False, 100, 1.0, ["Invalid or empty URL"], "invalid"
        )

    full_url = url_string.lower()

    # Decide whether the caller supplied a scheme. The test is
    # case-insensitive and requires "://" so that hosts such as
    # "httpbin.org" are not mistaken for URLs that already carry a scheme.
    # data: URIs have no "//" authority and must be parsed unmodified.
    is_data_uri = full_url.startswith('data:')
    has_scheme = is_data_uri or re.match(r'[a-z][a-z0-9+.\-]*://', full_url) is not None
    url_with_protocol = url_string if has_scheme else f'http://{url_string}'

    try:
        parsed = urlparse(url_with_protocol)
        hostname = (parsed.hostname or "").lower()
        # Reading .port validates it: a non-numeric or out-of-range port
        # raises ValueError, which must be reported as malformed, not crash.
        port = parsed.port
    except ValueError:
        return _build_result(
            url_string, False, 80, 0.9,
            ["Malformed URL that cannot be parsed"], "malformed",
        )

    path = parsed.path.lower()

    # Exact-match allow list short-circuits every other rule.
    if hostname in TRUSTED_DOMAINS:
        return _build_result(url_string, True, 0, 0.95, [], "trusted")

    reasons: List[str] = []
    risk_score = 0

    # Raw IPv4 address used as the host.
    if re.match(r'^(\d{1,3}\.){3}\d{1,3}$', hostname):
        risk_score += 30
        reasons.append("URL uses an IP address instead of a domain name — common in phishing")

    # Last hostname label against the suspicious-TLD list.
    tld = '.' + (hostname.split('.')[-1] if hostname else "")
    if tld in SUSPICIOUS_TLDS:
        risk_score += 20
        reasons.append(f'Uses suspicious top-level domain "{tld}" — frequently abused for phishing')

    # More than two labels in front of the registrable "name.tld" pair.
    subdomain_count = len(hostname.split('.')) - 2 if hostname else 0
    if subdomain_count > 2:
        risk_score += 15
        reasons.append(f"Excessive subdomains ({subdomain_count + 2} levels) — used to disguise real domain")

    if hostname in URL_SHORTENERS:
        risk_score += 25
        reasons.append("URL shortener detected — hides the actual destination, often used in phishing")

    if HOMOGLYPH_PATTERN.search(url_string):
        risk_score += 35
        reasons.append("Contains homoglyph/Cyrillic characters — used to impersonate legitimate domains")

    if len(url_string) > 100:
        risk_score += 10
        reasons.append(f"Unusually long URL ({len(url_string)} characters) — may be hiding malicious content")

    # Keyword hits contribute 8 points each, capped at 25 in total.
    found_keywords = [kw for kw in SUSPICIOUS_KEYWORDS if kw in full_url]
    if found_keywords:
        risk_score += min(len(found_keywords) * 8, 25)
        reasons.append(f"Contains suspicious keywords: {', '.join(found_keywords)} — common in phishing URLs")

    if '@' in url_string:
        risk_score += 25
        reasons.append('Contains "@" symbol — can redirect to a different domain than displayed')

    if is_data_uri:
        risk_score += 40
        reasons.append("Data URI detected — can embed malicious content without a server")

    # Penalize plain HTTP only when the caller actually wrote "http://"; the
    # scheme synthesized above for scheme-less input says nothing about the
    # real link. The local-host exemption matches whole labels so that a host
    # like "localhost.evil.com" does not slip through.
    is_local_host = hostname == 'localhost' or hostname.endswith('.localhost')
    if has_scheme and parsed.scheme == 'http' and not is_local_host:
        risk_score += 40
        reasons.append("Uses HTTP instead of HTTPS — connection is not encrypted and insecure")

    if port and port not in (80, 443):
        risk_score += 10
        reasons.append(f"Non-standard port (:{port}) — unusual for legitimate websites")

    # Path ending in two short extensions, e.g. "invoice.pdf.exe".
    if re.search(r'\.\w{2,4}\.\w{2,4}$', path):
        risk_score += 20
        reasons.append("Double file extension detected in path — common trick to disguise malware")

    encoded_count = len(re.findall(r'%[0-9a-fA-F]{2}', url_string))
    if encoded_count > 5:
        risk_score += 15
        reasons.append(f"Heavy URL encoding ({encoded_count} encoded chars) — may be obfuscating content")

    hyphens = hostname.count('-')
    if hyphens > 3:
        risk_score += 15
        reasons.append(f"Excessive hyphens in domain ({hyphens}) — common in phishing domains")

    risk_score = min(risk_score, 100)

    # Base confidence of 0.3 plus 0.12 per rule that fired, capped at 1.0.
    confidence = min(0.3 + len(reasons) * 0.12, 1.0)

    if risk_score >= 70:
        category = "high_risk"
    elif risk_score >= 40:
        category = "medium_risk"
    elif risk_score >= 20:
        category = "low_risk"
    else:
        category = "safe"

    return _build_result(
        url_string, risk_score < 40, risk_score, round(confidence, 2), reasons, category
    )
|
|
|
|
async def check_phishtank(url_string: str, api_key: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """
    Check a URL against the PhishTank database.

    This is a best-effort lookup: any failure (missing aiohttp, network
    error, timeout, non-200 status, unparseable body) is logged and
    reported as None rather than raised.

    Args:
        url_string: URL to check
        api_key: PhishTank API key (optional); sent as ``app_key`` when given

    Returns:
        Dictionary with inDatabase, isPhish and phishDetailUrl, or None when
        the lookup could not be completed.
    """
    try:
        # Imported lazily so the rule-based analyzer works without aiohttp.
        import aiohttp

        params = {
            'url': url_string,
            'format': 'json',
        }
        if api_key:
            params['app_key'] = api_key

        async with aiohttp.ClientSession() as session:
            # Use the production endpoint over HTTPS: the request carries the
            # URL under investigation and, optionally, the API key, neither of
            # which should travel in clear text.
            async with session.post(
                'https://checkurl.phishtank.com/checkurl/',
                data=params,
                headers={'User-Agent': 'phishtank/wingineers'},
                timeout=aiohttp.ClientTimeout(total=5),
            ) as response:
                if response.status != 200:
                    return None
                data = await response.json()

        # Tolerate a missing or null "results" member.
        results = data.get('results') or {}
        return {
            "inDatabase": results.get('in_database') in [True, 'true'],
            "isPhish": results.get('valid') in [True, 'y'],
            "phishDetailUrl": results.get('phish_detail_page'),
        }
    except Exception as e:
        # Deliberately broad: the lookup is optional and must never break
        # the caller's analysis.
        logger.warning("PhishTank lookup failed: %s", e)
        return None
|
|
|
|
# Categories that are a pure function of riskScore; other categories
# ("invalid", "malformed", "trusted") are never recomputed from the score.
_SCORE_DERIVED_CATEGORIES = frozenset({"safe", "low_risk", "medium_risk", "high_risk"})


def _risk_category(score: int) -> str:
    """Map a 0-100 risk score onto its score-derived category name."""
    if score >= 70:
        return "high_risk"
    if score >= 40:
        return "medium_risk"
    if score >= 20:
        return "low_risk"
    return "safe"


async def full_analysis(url_string: str, phish_tank_api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Full analysis: rule-based heuristics combined with a PhishTank lookup.

    Args:
        url_string: URL to analyze
        phish_tank_api_key: PhishTank API key (optional)

    Returns:
        The rule-based result dictionary. When the PhishTank lookup succeeds
        its outcome is attached under "phishTank" and the verdict is adjusted:
        a confirmed phish forces a high-risk, unsafe verdict; a known but
        unconfirmed URL lowers a score above 20 by 10 points.
    """
    result = analyze_url(url_string)

    # Empty or non-string input cannot be looked up; skip the network call.
    if result["category"] == "invalid":
        return result

    phish_tank_result = await check_phishtank(url_string, phish_tank_api_key)
    if not phish_tank_result:
        return result

    result["phishTank"] = phish_tank_result

    if phish_tank_result["isPhish"]:
        result["safe"] = False
        result["riskScore"] = max(result["riskScore"], 90)
        result["confidence"] = max(result["confidence"], 0.95)
        result["reasons"].append("⚠️ Confirmed phishing URL in PhishTank database")
        result["category"] = "high_risk"
    elif phish_tank_result["inDatabase"] and result["riskScore"] > 20:
        result["riskScore"] = max(result["riskScore"] - 10, 0)
        # Keep category and safe in step with the lowered score so the
        # result never reports, e.g., riskScore 35 alongside "medium_risk".
        if result["category"] in _SCORE_DERIVED_CATEGORIES:
            result["category"] = _risk_category(result["riskScore"])
            result["safe"] = result["riskScore"] < 40

    return result
|
|