Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| from urllib.parse import urlparse | |
| import re | |
# Read the phishing-URL dataset from disk.
phish_df = pd.read_csv('phishing_urls.csv')

# Report basic dataset shape before any analysis.
print("=== PHISHING DATASET INFO ===")
print(f"Total phishing URLs: {len(phish_df)}")
print(f"Columns: {phish_df.columns.tolist()}\n")

# Name of the column holding raw URLs; edit if your CSV differs.
url_column = 'url'  # Change to your actual column name

print("=== PHISHING TYPE ANALYSIS (from raw URLs) ===\n")
# Function to analyze URL
def analyze_phishing_type(url):
    """Classify a raw URL into a heuristic phishing category.

    Categories are checked in priority order: ip_based,
    brand_impersonation, generic_phishing, suspicious_tld,
    compromised_site, then 'other'.

    Parameters
    ----------
    url : str
        Raw URL; a scheme is optional (bare domains are handled).

    Returns
    -------
    dict
        Keys 'url', 'domain', 'path', 'type', plus 'brand' for
        brand_impersonation and 'keyword_count' for generic_phishing.
    """
    url = str(url).lower()
    # BUGFIX: urlparse() puts a scheme-less URL entirely into .path and
    # leaves .netloc empty, so every domain-based check below would miss.
    # Prepending '//' makes urlparse treat the leading token as the host.
    if '://' in url or url.startswith('//'):
        parsed = urlparse(url)
    else:
        parsed = urlparse('//' + url)
    domain = parsed.netloc
    path = parsed.path

    result = {
        'url': url,
        'domain': domain,
        'path': path,
        'type': 'unknown'
    }

    # 1. IP-based phishing: dotted-quad anywhere in the host part.
    ip_pattern = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
    if re.search(ip_pattern, domain):
        result['type'] = 'ip_based'
        return result

    # 2. Brand impersonation (check for known brands)
    brands = [
        'paypal', 'amazon', 'apple', 'google', 'microsoft', 'facebook',
        'netflix', 'ebay', 'instagram', 'twitter', 'linkedin', 'bank',
        'chase', 'wellsfargo', 'citi', 'americanexpress', 'visa', 'mastercard',
        'dhl', 'fedex', 'ups', 'usps', 'alibaba', 'walmart', 'adobe',
        'dropbox', 'office365', 'outlook', 'yahoo', 'aol', 'whatsapp'
    ]
    # A brand name appearing anywhere except the registered (second-level)
    # domain is treated as impersonation.
    url_full = domain + path
    parts = domain.split('.')
    sld = parts[-2] if len(parts) >= 2 else domain
    for brand in brands:
        if brand in url_full:
            if brand == sld or sld.startswith(brand):
                # Brand IS the registered domain -> legitimate usage;
                # stop scanning brands entirely (matches original logic).
                break
            result['type'] = 'brand_impersonation'
            result['brand'] = brand
            return result

    # 3. Phishing keywords (generic phishing): two or more hits anywhere
    # in host+path is considered suspicious on its own.
    phishing_keywords = [
        'login', 'signin', 'verify', 'account', 'update', 'secure',
        'confirm', 'suspended', 'locked', 'alert', 'urgent', 'validate',
        'banking', 'credential', 'auth', 'password', 'restore', 'recover'
    ]
    keyword_count = sum(1 for kw in phishing_keywords if kw in url_full)
    if keyword_count >= 2:
        result['type'] = 'generic_phishing'
        result['keyword_count'] = keyword_count
        return result

    # 4. Suspicious TLD.
    # BUGFIX: the original used substring matching ('.tk' in domain),
    # which misfired on hosts like 'foo.tk.example.com'; the TLD must be
    # the domain's suffix.
    suspicious_tlds = ('.tk', '.ml', '.ga', '.cf', '.gq', '.xyz', '.top', '.work', '.click')
    if domain.endswith(suspicious_tlds):
        result['type'] = 'suspicious_tld'
        return result

    # 5. Compromised site: trusted TLD but phishing keyword in the path
    # (suffix match for the same reason as above).
    trusted_tlds = ('.com', '.org', '.net', '.edu', '.gov')
    if domain.endswith(trusted_tlds):
        if any(kw in path for kw in phishing_keywords):
            result['type'] = 'compromised_site'
            return result

    # Default: nothing matched.
    result['type'] = 'other'
    return result
# Classify every URL in the dataset.
print("Analyzing URLs... (this may take a minute)")
results = [analyze_phishing_type(u) for u in phish_df[url_column]]
results_df = pd.DataFrame(results)

# Tally classifications and print the share of each category.
type_counts = results_df['type'].value_counts()
print("\n=== PHISHING TYPE DISTRIBUTION ===")
for ptype, count in type_counts.items():
    percentage = (count / len(phish_df)) * 100
    print(f"{ptype:20s}: {count:6d} / {len(phish_df)} ({percentage:5.1f}%)")
# Domain characteristics
print("\n=== DOMAIN CHARACTERISTICS ===")
# Length of the host string per URL.
domain_lengths = results_df['domain'].apply(len)
print(f"Avg domain length: {domain_lengths.mean():.1f} chars")
print(f"Median domain length: {domain_lengths.median():.1f} chars")
# Number of dot-separated labels in the host.
num_parts = results_df['domain'].apply(lambda d: len(d.split('.')))
print(f"Avg domain parts: {num_parts.mean():.1f}")
print(f"Median domain parts: {num_parts.median():.1f}")
# Subdomain count = labels minus SLD and TLD.
# BUGFIX: hosts with fewer than two labels (e.g. an empty domain from an
# unparseable URL) previously produced negative counts that dragged the
# average down; clamp at zero.
num_subdomains = (num_parts - 2).clip(lower=0)
print(f"Avg subdomains: {num_subdomains.mean():.1f}")
# Path characteristics
print("\n=== PATH CHARACTERISTICS ===")
path_lengths = results_df['path'].apply(len)
print(f"Avg path length: {path_lengths.mean():.1f} chars")
# A path of length <= 1 is empty or just '/', i.e. no meaningful path.
print(f"URLs with paths: {(path_lengths > 1).sum()} / {len(phish_df)} ({(path_lengths > 1).sum()/len(phish_df)*100:.1f}%)")
# Show up to 3 example URLs for each of the top 5 categories.
print("\n=== EXAMPLES BY TYPE ===")
for ptype in type_counts.index[:5]:
    examples = results_df[results_df['type'] == ptype]['url'].head(3)
    print(f"\n{ptype.upper()}:")
    for i, ex in enumerate(examples, 1):
        # BUGFIX: only append an ellipsis when the URL was actually
        # truncated; the original added '...' to every URL, making short
        # ones look cut off.
        suffix = '...' if len(ex) > 100 else ''
        print(f" {i}. {ex[:100]}{suffix}")
# Persist the per-URL classification for downstream use.
results_df.to_csv('phishing_type_analysis.csv', index=False)
print("\n✅ Detailed results saved to: phishing_type_analysis.csv")