Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| from urllib.parse import urlparse | |
| import re | |
# Read the phishing-URL dataset from disk.
phish_df = pd.read_csv('phishing_urls.csv')

# Report basic dataset shape before any analysis.
print("=== PHISHING DATASET INFO ===")
print(f"Total phishing URLs: {len(phish_df)}")
print(f"Columns: {phish_df.columns.tolist()}\n")

# Name of the column holding raw URLs; edit if your CSV differs.
url_column = 'url'  # Change to your actual column name

print("=== PHISHING TYPE ANALYSIS (from raw URLs) ===\n")
# Function to analyze URL
def analyze_phishing_type(url):
    """Classify a raw URL into a heuristic phishing category.

    Categories are checked in priority order: ip_based,
    brand_impersonation, generic_phishing, suspicious_tld,
    compromised_site, then 'other'.

    Parameters
    ----------
    url : str
        Raw URL; a scheme is optional (bare domains are handled).

    Returns
    -------
    dict
        Keys 'url', 'domain', 'path', 'type', plus 'brand' for
        brand_impersonation and 'keyword_count' for generic_phishing.
    """
    url = str(url).lower()
    # BUGFIX: urlparse() puts a scheme-less URL entirely into .path and
    # leaves .netloc empty, so every domain-based check below would miss.
    # Prepending '//' makes urlparse treat the leading token as the host.
    if '://' in url or url.startswith('//'):
        parsed = urlparse(url)
    else:
        parsed = urlparse('//' + url)
    domain = parsed.netloc
    path = parsed.path

    result = {
        'url': url,
        'domain': domain,
        'path': path,
        'type': 'unknown'
    }

    # 1. IP-based phishing: dotted-quad anywhere in the host part.
    ip_pattern = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
    if re.search(ip_pattern, domain):
        result['type'] = 'ip_based'
        return result

    # 2. Brand impersonation (check for known brands)
    brands = [
        'paypal', 'amazon', 'apple', 'google', 'microsoft', 'facebook',
        'netflix', 'ebay', 'instagram', 'twitter', 'linkedin', 'bank',
        'chase', 'wellsfargo', 'citi', 'americanexpress', 'visa', 'mastercard',
        'dhl', 'fedex', 'ups', 'usps', 'alibaba', 'walmart', 'adobe',
        'dropbox', 'office365', 'outlook', 'yahoo', 'aol', 'whatsapp'
    ]
    # A brand name appearing anywhere except the registered (second-level)
    # domain is treated as impersonation.
    url_full = domain + path
    parts = domain.split('.')
    sld = parts[-2] if len(parts) >= 2 else domain
    for brand in brands:
        if brand in url_full:
            if brand == sld or sld.startswith(brand):
                # Brand IS the registered domain -> legitimate usage;
                # stop scanning brands entirely (matches original logic).
                break
            result['type'] = 'brand_impersonation'
            result['brand'] = brand
            return result

    # 3. Phishing keywords (generic phishing): two or more hits anywhere
    # in host+path is considered suspicious on its own.
    phishing_keywords = [
        'login', 'signin', 'verify', 'account', 'update', 'secure',
        'confirm', 'suspended', 'locked', 'alert', 'urgent', 'validate',
        'banking', 'credential', 'auth', 'password', 'restore', 'recover'
    ]
    keyword_count = sum(1 for kw in phishing_keywords if kw in url_full)
    if keyword_count >= 2:
        result['type'] = 'generic_phishing'
        result['keyword_count'] = keyword_count
        return result

    # 4. Suspicious TLD.
    # BUGFIX: the original used substring matching ('.tk' in domain),
    # which misfired on hosts like 'foo.tk.example.com'; the TLD must be
    # the domain's suffix.
    suspicious_tlds = ('.tk', '.ml', '.ga', '.cf', '.gq', '.xyz', '.top', '.work', '.click')
    if domain.endswith(suspicious_tlds):
        result['type'] = 'suspicious_tld'
        return result

    # 5. Compromised site: trusted TLD but phishing keyword in the path
    # (suffix match for the same reason as above).
    trusted_tlds = ('.com', '.org', '.net', '.edu', '.gov')
    if domain.endswith(trusted_tlds):
        if any(kw in path for kw in phishing_keywords):
            result['type'] = 'compromised_site'
            return result

    # Default: nothing matched.
    result['type'] = 'other'
    return result
# Classify every URL in the dataset.
print("Analyzing URLs... (this may take a minute)")
results = [analyze_phishing_type(u) for u in phish_df[url_column]]
results_df = pd.DataFrame(results)

# Tally classifications and print the share of each category.
type_counts = results_df['type'].value_counts()
print("\n=== PHISHING TYPE DISTRIBUTION ===")
for ptype, count in type_counts.items():
    percentage = (count / len(phish_df)) * 100
    print(f"{ptype:20s}: {count:6d} / {len(phish_df)} ({percentage:5.1f}%)")
# Domain characteristics
print("\n=== DOMAIN CHARACTERISTICS ===")
# Length of the host string per URL.
domain_lengths = results_df['domain'].apply(len)
print(f"Avg domain length: {domain_lengths.mean():.1f} chars")
print(f"Median domain length: {domain_lengths.median():.1f} chars")
# Number of dot-separated labels in the host.
num_parts = results_df['domain'].apply(lambda d: len(d.split('.')))
print(f"Avg domain parts: {num_parts.mean():.1f}")
print(f"Median domain parts: {num_parts.median():.1f}")
# Subdomain count = labels minus SLD and TLD.
# BUGFIX: hosts with fewer than two labels (e.g. an empty domain from an
# unparseable URL) previously produced negative counts that dragged the
# average down; clamp at zero.
num_subdomains = (num_parts - 2).clip(lower=0)
print(f"Avg subdomains: {num_subdomains.mean():.1f}")
# Path characteristics
print("\n=== PATH CHARACTERISTICS ===")
path_lengths = results_df['path'].apply(len)
print(f"Avg path length: {path_lengths.mean():.1f} chars")
# A path of length <= 1 is empty or just '/', i.e. no meaningful path.
print(f"URLs with paths: {(path_lengths > 1).sum()} / {len(phish_df)} ({(path_lengths > 1).sum()/len(phish_df)*100:.1f}%)")
# Show up to 3 example URLs for each of the top 5 categories.
print("\n=== EXAMPLES BY TYPE ===")
for ptype in type_counts.index[:5]:
    examples = results_df[results_df['type'] == ptype]['url'].head(3)
    print(f"\n{ptype.upper()}:")
    for i, ex in enumerate(examples, 1):
        # BUGFIX: only append an ellipsis when the URL was actually
        # truncated; the original added '...' to every URL, making short
        # ones look cut off.
        suffix = '...' if len(ex) > 100 else ''
        print(f" {i}. {ex[:100]}{suffix}")
# Persist the per-URL classification for downstream use.
results_df.to_csv('phishing_type_analysis.csv', index=False)
print("\n✅ Detailed results saved to: phishing_type_analysis.csv")