"""Heuristic classification of phishing URLs into attack-type categories.

Reads raw URLs from phishing_urls.csv, classifies each with simple lexical
heuristics, prints summary statistics, and writes per-URL results to
phishing_type_analysis.csv.
"""

import re
from urllib.parse import urlparse

import pandas as pd

# Brand names frequently impersonated in phishing URLs.
BRANDS = [
    'paypal', 'amazon', 'apple', 'google', 'microsoft', 'facebook',
    'netflix', 'ebay', 'instagram', 'twitter', 'linkedin', 'bank',
    'chase', 'wellsfargo', 'citi', 'americanexpress', 'visa',
    'mastercard', 'dhl', 'fedex', 'ups', 'usps', 'alibaba', 'walmart',
    'adobe', 'dropbox', 'office365', 'outlook', 'yahoo', 'aol', 'whatsapp',
]

# Generic credential-phishing vocabulary.
PHISHING_KEYWORDS = [
    'login', 'signin', 'verify', 'account', 'update', 'secure',
    'confirm', 'suspended', 'locked', 'alert', 'urgent', 'validate',
    'banking', 'credential', 'auth', 'password', 'restore', 'recover',
]

# TLDs with cheap/free registration, heavily abused by phishers.
SUSPICIOUS_TLDS = ('.tk', '.ml', '.ga', '.cf', '.gq', '.xyz', '.top', '.work', '.click')

# Established TLDs; phishing vocabulary here usually means a compromised site.
TRUSTED_TLDS = ('.com', '.org', '.net', '.edu', '.gov')

# Entire host is a dotted quad, optionally with a port (e.g. 1.2.3.4:8080).
# Anchored via fullmatch so '192.168.1.1.evil.com' is NOT treated as an IP.
_IP_HOST_RE = re.compile(r'\d{1,3}(?:\.\d{1,3}){3}(?::\d+)?')


def analyze_phishing_type(url):
    """Classify a raw URL into a phishing-type category.

    Heuristics are applied in priority order:
      1. ip_based            - host is a bare IP address
      2. brand_impersonation - known brand appears outside the registered domain
      3. generic_phishing    - two or more phishing keywords anywhere in the URL
      4. suspicious_tld      - domain uses an abuse-prone TLD
      5. compromised_site    - trusted TLD but phishing keywords in the path
      6. other               - none of the above

    Parameters
    ----------
    url : any
        Raw URL; coerced with str() and lowercased. A missing scheme
        (e.g. 'evil.tk/login') is tolerated.

    Returns
    -------
    dict
        Keys 'url', 'domain', 'path', 'type', plus 'brand' for
        brand_impersonation or 'keyword_count' for generic_phishing.
    """
    url = str(url).lower()
    # Without a scheme, urlparse puts the host into .path and leaves
    # .netloc empty; prefixing '//' makes the host parse correctly.
    parsed = urlparse(url if '://' in url else '//' + url)
    domain = parsed.netloc
    path = parsed.path
    result = {'url': url, 'domain': domain, 'path': path, 'type': 'unknown'}

    # 1. IP-based phishing: the whole host is a dotted quad.
    if _IP_HOST_RE.fullmatch(domain):
        result['type'] = 'ip_based'
        return result

    # 2. Brand impersonation: brand appears in the URL but is not the
    # registered (second-level) domain itself.
    url_full = domain + path
    parts = domain.split('.')
    sld = parts[-2] if len(parts) >= 2 else domain
    for brand in BRANDS:
        if brand in url_full:
            if brand == sld or sld.startswith(brand):
                # Legitimate use of the brand's own domain; stop scanning.
                break
            result['type'] = 'brand_impersonation'
            result['brand'] = brand
            return result

    # 3. Generic phishing: several credential-related keywords anywhere.
    keyword_count = sum(1 for kw in PHISHING_KEYWORDS if kw in url_full)
    if keyword_count >= 2:
        result['type'] = 'generic_phishing'
        result['keyword_count'] = keyword_count
        return result

    # TLD checks use endswith on the host (port stripped) so '.ml' cannot
    # match inside the middle of a domain name.
    host = domain.split(':')[0]

    # 4. Suspicious TLD.
    if host.endswith(SUSPICIOUS_TLDS):
        result['type'] = 'suspicious_tld'
        return result

    # 5. Compromised site: trusted TLD but phishing vocabulary in the path.
    if host.endswith(TRUSTED_TLDS) and any(kw in path for kw in PHISHING_KEYWORDS):
        result['type'] = 'compromised_site'
        return result

    result['type'] = 'other'
    return result


def main():
    """Run the full analysis pipeline over phishing_urls.csv."""
    phish_df = pd.read_csv('phishing_urls.csv')
    total = len(phish_df)
    print("=== PHISHING DATASET INFO ===")
    print(f"Total phishing URLs: {total}")
    print(f"Columns: {phish_df.columns.tolist()}\n")

    url_column = 'url'  # Change to your actual column name
    print("=== PHISHING TYPE ANALYSIS (from raw URLs) ===\n")

    print("Analyzing URLs... (this may take a minute)")
    results_df = pd.DataFrame(
        [analyze_phishing_type(u) for u in phish_df[url_column]]
    )

    # Distribution of detected phishing types.
    type_counts = results_df['type'].value_counts()
    print("\n=== PHISHING TYPE DISTRIBUTION ===")
    for ptype, count in type_counts.items():
        percentage = (count / total) * 100
        print(f"{ptype:20s}: {count:6d} / {total} ({percentage:5.1f}%)")

    print("\n=== DOMAIN CHARACTERISTICS ===")
    domain_lengths = results_df['domain'].str.len()
    print(f"Avg domain length: {domain_lengths.mean():.1f} chars")
    print(f"Median domain length: {domain_lengths.median():.1f} chars")

    # Dot-separated labels in the domain (e.g. 'a.b.com' -> 3).
    num_parts = results_df['domain'].apply(lambda d: len(d.split('.')))
    print(f"Avg domain parts: {num_parts.mean():.1f}")
    print(f"Median domain parts: {num_parts.median():.1f}")

    # Labels beyond SLD + TLD; clamped so bare hostnames don't go negative.
    num_subdomains = (num_parts - 2).clip(lower=0)
    print(f"Avg subdomains: {num_subdomains.mean():.1f}")

    print("\n=== PATH CHARACTERISTICS ===")
    path_lengths = results_df['path'].str.len()
    print(f"Avg path length: {path_lengths.mean():.1f} chars")
    with_paths = (path_lengths > 1).sum()
    print(f"URLs with paths: {with_paths} / {total} ({with_paths / total * 100:.1f}%)")

    # Up to three sample URLs for each of the five most common types.
    print("\n=== EXAMPLES BY TYPE ===")
    for ptype in type_counts.index[:5]:
        examples = results_df.loc[results_df['type'] == ptype, 'url'].head(3)
        print(f"\n{ptype.upper()}:")
        for i, ex in enumerate(examples, 1):
            print(f"  {i}. {ex[:100]}...")

    results_df.to_csv('phishing_type_analysis.csv', index=False)
    print("\n✅ Detailed results saved to: phishing_type_analysis.csv")


if __name__ == "__main__":
    main()