# Source: Hugging Face upload "Upload 50 files" by rb1337, commit 2cc7f91 (verified)
import pandas as pd
from urllib.parse import urlparse
import re
# Load the phishing URL dataset from disk.
phish_df = pd.read_csv('phishing_urls.csv')

print("=== PHISHING DATASET INFO ===")
print(f"Total phishing URLs: {len(phish_df)}")
print(f"Columns: {phish_df.columns.tolist()}\n")

# Name of the CSV column that holds the raw URL strings.
# Change this if your dataset uses a different header.
url_column = 'url'

print("=== PHISHING TYPE ANALYSIS (from raw URLs) ===\n")
# Function to analyze URL
def analyze_phishing_type(url):
    """Classify a raw URL into a heuristic phishing category.

    Categories, checked in priority order (first match wins):
      1. ``ip_based``            - the host is a literal dotted-quad IP.
      2. ``brand_impersonation`` - a known brand appears in the URL but
                                   is not the registered (second-level) domain.
      3. ``generic_phishing``    - two or more credential-themed keywords.
      4. ``suspicious_tld``      - host ends with a throwaway/free TLD.
      5. ``compromised_site``    - trusted TLD but phishing keyword in path.
      6. ``other``               - nothing matched.

    Parameters
    ----------
    url : any
        Raw URL; coerced to a lowercase string before parsing.

    Returns
    -------
    dict
        Keys 'url', 'domain', 'path', 'type', plus 'brand' for
        brand_impersonation or 'keyword_count' for generic_phishing.
    """
    url = str(url).lower()
    parsed = urlparse(url)
    domain = parsed.netloc
    path = parsed.path
    # Strip an explicit port (e.g. 'evil.com:8080') so the IP and TLD
    # checks below see only the hostname.
    host = domain.split(':')[0]
    result = {
        'url': url,
        'domain': domain,
        'path': path,
        'type': 'unknown',
    }

    # 1. IP-based phishing.  fullmatch anchors the pattern to the whole
    #    host: a plain re.search would also flag hosts that merely
    #    *contain* a dotted quad, such as '1.2.3.4.evil.com'.
    if re.fullmatch(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', host):
        result['type'] = 'ip_based'
        return result

    # 2. Brand impersonation (check for known brands).
    brands = [
        'paypal', 'amazon', 'apple', 'google', 'microsoft', 'facebook',
        'netflix', 'ebay', 'instagram', 'twitter', 'linkedin', 'bank',
        'chase', 'wellsfargo', 'citi', 'americanexpress', 'visa', 'mastercard',
        'dhl', 'fedex', 'ups', 'usps', 'alibaba', 'walmart', 'adobe',
        'dropbox', 'office365', 'outlook', 'yahoo', 'aol', 'whatsapp'
    ]
    url_full = domain + path
    # Second-level domain, e.g. 'evil' in 'login.evil.com'.
    parts = host.split('.')
    sld = parts[-2] if len(parts) >= 2 else host
    for brand in brands:
        if brand in url_full:
            if brand == sld or sld.startswith(brand):
                # Brand owns the registered domain -> legitimate usage.
                # Keep scanning the remaining brands (the old code used
                # `break` here, silently skipping every later brand).
                continue
            # Brand mentioned somewhere other than the main domain.
            result['type'] = 'brand_impersonation'
            result['brand'] = brand
            return result

    # 3. Generic phishing: two or more credential-themed keywords
    #    anywhere in domain + path.
    phishing_keywords = [
        'login', 'signin', 'verify', 'account', 'update', 'secure',
        'confirm', 'suspended', 'locked', 'alert', 'urgent', 'validate',
        'banking', 'credential', 'auth', 'password', 'restore', 'recover'
    ]
    keyword_count = sum(1 for kw in phishing_keywords if kw in url_full)
    if keyword_count >= 2:
        result['type'] = 'generic_phishing'
        result['keyword_count'] = keyword_count
        return result

    # 4. Suspicious TLD.  endswith() (with a tuple of options) replaces
    #    the old substring test, which misfired on hostnames that merely
    #    contained a TLD string mid-label (e.g. '.click' in
    #    'shop.click.example.org').
    suspicious_tlds = ('.tk', '.ml', '.ga', '.cf', '.gq', '.xyz', '.top', '.work', '.click')
    if host.endswith(suspicious_tlds):
        result['type'] = 'suspicious_tld'
        return result

    # 5. Compromised site: trusted TLD hosting a phishing-flavoured path.
    trusted_tlds = ('.com', '.org', '.net', '.edu', '.gov')
    if host.endswith(trusted_tlds) and any(kw in path for kw in phishing_keywords):
        result['type'] = 'compromised_site'
        return result

    # Default bucket: nothing matched.
    result['type'] = 'other'
    return result
# Classify every URL in the dataset and tabulate the results.
print("Analyzing URLs... (this may take a minute)")
results = [analyze_phishing_type(u) for u in phish_df[url_column]]
results_df = pd.DataFrame(results)

# Frequency of each heuristic category, most common first.
type_counts = results_df['type'].value_counts()
print("\n=== PHISHING TYPE DISTRIBUTION ===")
for ptype, count in type_counts.items():
    percentage = (count / len(phish_df)) * 100
    print(f"{ptype:20s}: {count:6d} / {len(phish_df)} ({percentage:5.1f}%)")
# Summary statistics over the extracted hostnames.
print("\n=== DOMAIN CHARACTERISTICS ===")

# Hostname length in characters (vectorised .str accessor instead of apply).
domain_lengths = results_df['domain'].str.len()
print(f"Avg domain length: {domain_lengths.mean():.1f} chars")
print(f"Median domain length: {domain_lengths.median():.1f} chars")

# Dot count + 1 == number of dot-separated labels in the hostname.
num_parts = results_df['domain'].str.count(r'\.') + 1
print(f"Avg domain parts: {num_parts.mean():.1f}")
print(f"Median domain parts: {num_parts.median():.1f}")

# Labels left over after removing the SLD and TLD.
num_subdomains = num_parts - 2
print(f"Avg subdomains: {num_subdomains.mean():.1f}")
# Summary statistics over the URL paths.
print("\n=== PATH CHARACTERISTICS ===")
path_lengths = results_df['path'].str.len()
print(f"Avg path length: {path_lengths.mean():.1f} chars")
# A path longer than one character means something beyond a bare '/'.
print(f"URLs with paths: {(path_lengths > 1).sum()} / {len(phish_df)} ({(path_lengths > 1).sum()/len(phish_df)*100:.1f}%)")
# Print up to three sample URLs for each of the five most common types.
print("\n=== EXAMPLES BY TYPE ===")
for ptype in type_counts.index[:5]:
    examples = results_df.loc[results_df['type'] == ptype, 'url'].head(3)
    print(f"\n{ptype.upper()}:")
    for i, ex in enumerate(examples, 1):
        print(f" {i}. {ex[:100]}...")

# Persist the per-URL classification for downstream inspection.
results_df.to_csv('phishing_type_analysis.csv', index=False)
print("\n✅ Detailed results saved to: phishing_type_analysis.csv")