import pandas as pd
from urllib.parse import urlparse
import re
# ---- Load the phishing URL dataset and report its basic shape ----
phish_df = pd.read_csv('phishing_urls.csv')
print("=== PHISHING DATASET INFO ===")
print(f"Total phishing URLs: {phish_df.shape[0]}")
print(f"Columns: {list(phish_df.columns)}\n")

# Name of the column holding the raw URLs; edit if your CSV differs.
url_column = 'url'
print("=== PHISHING TYPE ANALYSIS (from raw URLs) ===\n")
def analyze_phishing_type(url):
    """Classify a raw URL into a coarse phishing category.

    Categories, checked in priority order: 'ip_based',
    'brand_impersonation', 'generic_phishing', 'suspicious_tld',
    'compromised_site', 'other'.

    Parameters:
        url: raw URL string; a missing scheme ("example.com/login")
            is tolerated.

    Returns:
        dict with keys 'url', 'domain', 'path', 'type', plus 'brand'
        for brand_impersonation and 'keyword_count' for
        generic_phishing.
    """
    url = str(url).lower()
    # Bug fix: schemeless URLs used to parse with an empty netloc
    # (urlparse puts everything into .path), which defeated every
    # domain-based check below. Prepend a scheme when missing.
    parsed = urlparse(url if '://' in url else 'http://' + url)
    domain = parsed.netloc
    path = parsed.path
    host = domain.split(':')[0]  # strip any :port before host tests

    result = {
        'url': url,
        'domain': domain,
        'path': path,
        'type': 'unknown',
    }

    # 1. IP-based phishing: the host itself is a dotted-quad address.
    # Bug fix: fullmatch instead of search, so digit runs embedded in a
    # longer hostname are no longer misclassified as IPs.
    if re.fullmatch(r'\d{1,3}(?:\.\d{1,3}){3}', host):
        result['type'] = 'ip_based'
        return result

    # 2. Brand impersonation: a known brand name appears somewhere in
    # the URL but is not the registered (second-level) domain.
    brands = [
        'paypal', 'amazon', 'apple', 'google', 'microsoft', 'facebook',
        'netflix', 'ebay', 'instagram', 'twitter', 'linkedin', 'bank',
        'chase', 'wellsfargo', 'citi', 'americanexpress', 'visa', 'mastercard',
        'dhl', 'fedex', 'ups', 'usps', 'alibaba', 'walmart', 'adobe',
        'dropbox', 'office365', 'outlook', 'yahoo', 'aol', 'whatsapp'
    ]
    url_full = domain + path
    labels = host.split('.')
    sld = labels[-2] if len(labels) >= 2 else host
    for brand in brands:
        if brand in url_full:
            if brand == sld or sld.startswith(brand):
                # Brand owns the registered domain; treat as legitimate
                # use and fall through to the remaining checks.
                break
            result['type'] = 'brand_impersonation'
            result['brand'] = brand
            return result

    # 3. Generic phishing: two or more credential-harvesting keywords
    # anywhere in the URL.
    phishing_keywords = [
        'login', 'signin', 'verify', 'account', 'update', 'secure',
        'confirm', 'suspended', 'locked', 'alert', 'urgent', 'validate',
        'banking', 'credential', 'auth', 'password', 'restore', 'recover'
    ]
    keyword_count = sum(1 for kw in phishing_keywords if kw in url_full)
    if keyword_count >= 2:
        result['type'] = 'generic_phishing'
        result['keyword_count'] = keyword_count
        return result

    # 4. Suspicious TLD. Bug fix: endswith instead of substring match —
    # '.top' used to fire on hosts such as 'my.topsite.com'.
    suspicious_tlds = ('.tk', '.ml', '.ga', '.cf', '.gq', '.xyz', '.top', '.work', '.click')
    if host.endswith(suspicious_tlds):
        result['type'] = 'suspicious_tld'
        return result

    # 5. Compromised site: trusted TLD but a phishing keyword in the
    # path (same endswith fix as above — '.com' used to match 'com'
    # anywhere in the host).
    trusted_tlds = ('.com', '.org', '.net', '.edu', '.gov')
    if host.endswith(trusted_tlds):
        if any(kw in path for kw in phishing_keywords):
            result['type'] = 'compromised_site'
            return result

    # Nothing matched.
    result['type'] = 'other'
    return result
# ---- Classify every URL and summarise the results ----
print("Analyzing URLs... (this may take a minute)")
results_df = pd.DataFrame(
    [analyze_phishing_type(u) for u in phish_df[url_column]]
)

# Frequency of each phishing type, most common first.
type_counts = results_df['type'].value_counts()
total = len(phish_df)
print("\n=== PHISHING TYPE DISTRIBUTION ===")
for ptype, count in type_counts.items():
    print(f"{ptype:20s}: {count:6d} / {total} ({count / total * 100:5.1f}%)")

# Domain-level statistics.
print("\n=== DOMAIN CHARACTERISTICS ===")
dom_len = results_df['domain'].str.len()
print(f"Avg domain length: {dom_len.mean():.1f} chars")
print(f"Median domain length: {dom_len.median():.1f} chars")

parts_per_domain = results_df['domain'].str.split('.').str.len()
print(f"Avg domain parts: {parts_per_domain.mean():.1f}")
print(f"Median domain parts: {parts_per_domain.median():.1f}")

# Everything beyond SLD + TLD counts as a subdomain level.
subdomain_levels = parts_per_domain - 2
print(f"Avg subdomains: {subdomain_levels.mean():.1f}")

# Path-level statistics.
print("\n=== PATH CHARACTERISTICS ===")
path_len = results_df['path'].str.len()
print(f"Avg path length: {path_len.mean():.1f} chars")
print(f"URLs with paths: {(path_len > 1).sum()} / {total} ({(path_len > 1).sum() / total * 100:.1f}%)")

# A few sample URLs for each of the top five types.
print("\n=== EXAMPLES BY TYPE ===")
for ptype in type_counts.index[:5]:
    sample = results_df.loc[results_df['type'] == ptype, 'url'].head(3)
    print(f"\n{ptype.upper()}:")
    for rank, ex in enumerate(sample, 1):
        print(f" {rank}. {ex[:100]}...")

# Persist the per-URL classification for downstream work.
results_df.to_csv('phishing_type_analysis.csv', index=False)
print("\n✅ Detailed results saved to: phishing_type_analysis.csv")