Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| from collections import Counter | |
| from urllib.parse import urlparse | |
# --- Load the engineered feature table and split it by class label ---
df = pd.read_csv('data/features/url_features.csv')
phish_df = df.loc[df['label'] == 1].copy()  # Assuming 1 = phishing
legit_df = df.loc[df['label'] == 0].copy()  # Assuming 0 = legitimate

print("=== FREE PLATFORM DETECTION ANALYSIS ===\n")

# --- 1. Detection rate: how often the is_free_platform flag is set per class ---
phish_free_flags = phish_df['is_free_platform']
legit_free_flags = legit_df['is_free_platform']

print(f"Total phishing: {len(phish_df)}")
print(f"Phishing on free platforms: {phish_free_flags.sum()} ({phish_free_flags.mean()*100:.1f}%)")
print(f"\nTotal legitimate: {len(legit_df)}")
print(f"Legitimate on free platforms: {legit_free_flags.sum()} ({legit_free_flags.mean()*100:.1f}%)")

# --- 2. Load the original URLs (adjust the column names if the dataset differs) ---
urls_df = pd.read_csv('data/processed/clean_dataset.csv')
phish_urls = urls_df.loc[urls_df['label'] == 1, 'url'].tolist()
legit_urls = urls_df.loc[urls_df['label'] == 0, 'url'].tolist()
# 3. Extract domains from phishing URLs
def extract_domain(url):
    """Return the lower-cased network location (netloc) of *url*.

    URLs that do not start with an explicit ``http://`` or ``https://``
    scheme (compared case-insensitively) are prefixed with ``http://`` so
    that ``urlparse`` recognises the host part instead of treating the whole
    string as a path.

    Args:
        url: The URL to parse. Non-string values (e.g. ``None`` or the NaN
            floats pandas uses for missing cells) are tolerated.

    Returns:
        The netloc component in lower case (it may still include a port or
        userinfo if the URL had one), or ``''`` when *url* is not a string
        or cannot be parsed.
    """
    if not isinstance(url, str):
        return ''

    # Test for the full "scheme://" prefix rather than just "http": a bare
    # host such as "httpbin.org/x" would otherwise be mistaken for a URL that
    # already has a scheme, and an upper-case "HTTP://" would be prefixed twice.
    if not url.lower().startswith(('http://', 'https://')):
        url = 'http://' + url

    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse rejects malformed authorities (e.g. an unbalanced IPv6 "[").
        return ''
    return parsed.netloc.lower()
# Map every phishing URL to its domain (unparseable URLs map to '').
phish_domains = list(map(extract_domain, phish_urls))

# --- 4. Most frequent full domains among the phishing URLs ---
print("\n=== TOP 50 PHISHING DOMAINS (by frequency) ===")
domain_counts = Counter(phish_domains)
for host, hits in domain_counts.most_common(50):
    print(f"{host:50s}: {hits:5d}")

# --- 5. Most frequent two-label suffixes, a rough proxy for hosting platforms ---
print("\n=== COMMON DOMAIN SUFFIXES (platforms) ===")
# Keep only the last two dot-separated labels (e.g. weebly.com); domains
# with fewer than two labels are skipped.
suffixes = [
    '.'.join(labels[-2:])
    for labels in (name.split('.') for name in phish_domains)
    if len(labels) >= 2
]
suffix_counts = Counter(suffixes)

print("\nTop 30 suffixes:")
total_domains = len(phish_domains)  # the percentage is relative to all phishing URLs
for tail, hits in suffix_counts.most_common(30):
    print(f"{tail:30s}: {hits:5d} ({hits/total_domains*100:.1f}%)")