# Phishing-Detection-System / scripts/feature_extraction/url/url_features_diagnostic.py
# (uploaded by rb1337 — "Upload 50 files", commit 2cc7f91 verified)
import pandas as pd
from collections import Counter
from urllib.parse import urlparse
# --- Load extracted URL features and split rows by class label ---
df = pd.read_csv('data/features/url_features.csv')
phish_mask = df['label'] == 1  # assumed encoding: 1 = phishing — confirm against dataset docs
phish_df = df[phish_mask].copy()
legit_df = df[df['label'] == 0].copy()  # assumed encoding: 0 = legitimate

print("=== FREE PLATFORM DETECTION ANALYSIS ===\n")

# 1. How often does the is_free_platform feature fire in each class?
free_phish = phish_df['is_free_platform']
print(f"Total phishing: {len(phish_df)}")
print(f"Phishing on free platforms: {free_phish.sum()} ({free_phish.mean()*100:.1f}%)")
free_legit = legit_df['is_free_platform']
print(f"\nTotal legitimate: {len(legit_df)}")
print(f"Legitimate on free platforms: {free_legit.sum()} ({free_legit.mean()*100:.1f}%)")

# 2. Pull the raw URLs so domains can be inspected directly
urls_df = pd.read_csv('data/processed/clean_dataset.csv')
phish_urls = urls_df.loc[urls_df['label'] == 1, 'url'].tolist()  # adjust column names if schema differs
legit_urls = urls_df.loc[urls_df['label'] == 0, 'url'].tolist()
# 3. Extract domains from phishing URLs
def extract_domain(url):
    """Return the lowercased host (netloc) of *url*, or '' if it cannot be parsed.

    A scheme is prepended when missing so bare domains like 'example.com/path'
    parse with a non-empty netloc.  Non-string input (e.g. NaN from pandas)
    yields ''.
    """
    try:
        # Require a real scheme prefix: the looser startswith('http') check
        # misfired on scheme-less hosts such as 'httpbin.org', which then
        # parsed with an empty netloc and was silently dropped.
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        return urlparse(url).netloc.lower()
    except (AttributeError, ValueError):
        # AttributeError: non-string input (e.g. float NaN from read_csv);
        # ValueError: malformed URL (e.g. invalid IPv6 literal or bad port).
        return ''
# Map every phishing URL to its host; failures come back as ''.
phish_domains = [extract_domain(u) for u in phish_urls]

# 4. Most frequent exact phishing domains
print("\n=== TOP 50 PHISHING DOMAINS (by frequency) ===")
for dom, n in Counter(phish_domains).most_common(50):
    print(f"{dom:50s}: {n:5d}")
# 5. Collapse each domain to its last two labels (e.g. sub.weebly.com -> weebly.com)
# to surface the hosting platforms phishing pages cluster on.
print("\n=== COMMON DOMAIN SUFFIXES (platforms) ===")
suffixes = [
    '.'.join(labels[-2:])
    for labels in (d.split('.') for d in phish_domains)
    if len(labels) >= 2
]
suffix_counts = Counter(suffixes)

print("\nTop 30 suffixes:")
total = len(phish_domains)
for sfx, n in suffix_counts.most_common(30):
    print(f"{sfx:30s}: {n:5d} ({n/total*100:.1f}%)")