# File size: 5,084 Bytes
# 2cc7f91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import pandas as pd
from urllib.parse import urlparse
import re

# Name of the CSV column holding the raw URL strings. Edit this if the
# dataset uses a different header.
url_column = 'url'

# Read the phishing URL dataset from the working directory.
phish_df = pd.read_csv('phishing_urls.csv')

# Summarise what was loaded, then announce the analysis that follows.
row_total = len(phish_df)
column_names = phish_df.columns.tolist()
print(
    "=== PHISHING DATASET INFO ===",
    f"Total phishing URLs: {row_total}",
    f"Columns: {column_names}\n",
    sep="\n",
)
print("=== PHISHING TYPE ANALYSIS (from raw URLs) ===\n")

# Function to analyze URL

# Matches a host that is, in its entirety, a dotted-quad IPv4 literal.
_IPV4_HOST_RE = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')

# Matches an explicit "scheme://" prefix (the URL is lower-cased beforehand).
_URL_SCHEME_RE = re.compile(r'[a-z][a-z0-9+.\-]*://')

# Brand names looked for anywhere in netloc + path (substring match).
_BRANDS = (
    'paypal', 'amazon', 'apple', 'google', 'microsoft', 'facebook',
    'netflix', 'ebay', 'instagram', 'twitter', 'linkedin', 'bank',
    'chase', 'wellsfargo', 'citi', 'americanexpress', 'visa', 'mastercard',
    'dhl', 'fedex', 'ups', 'usps', 'alibaba', 'walmart', 'adobe',
    'dropbox', 'office365', 'outlook', 'yahoo', 'aol', 'whatsapp',
)

# Words that commonly appear in credential-harvesting URLs.
_PHISHING_KEYWORDS = (
    'login', 'signin', 'verify', 'account', 'update', 'secure',
    'confirm', 'suspended', 'locked', 'alert', 'urgent', 'validate',
    'banking', 'credential', 'auth', 'password', 'restore', 'recover',
)

# Tuples so they can be handed straight to str.endswith().
_SUSPICIOUS_TLDS = ('.tk', '.ml', '.ga', '.cf', '.gq', '.xyz', '.top', '.work', '.click')
_TRUSTED_TLDS = ('.com', '.org', '.net', '.edu', '.gov')


def analyze_phishing_type(url):
    """Classify a phishing URL into a coarse category from its text alone.

    The URL is stringified, stripped and lower-cased, then checked against
    the rules below in order; the first rule that matches wins:

    1. ``ip_based``            - the host is a dotted-quad IPv4 literal.
    2. ``brand_impersonation`` - a known brand name appears in netloc + path
       but the second-level domain is not (and does not start with) that
       brand. The matched brand is stored under ``'brand'``.
    3. ``generic_phishing``    - at least two phishing keywords appear in
       netloc + path. The count is stored under ``'keyword_count'``.
    4. ``suspicious_tld``      - the host ends with a TLD from the
       suspicious list.
    5. ``compromised_site``    - the host ends with a trusted TLD and the
       path contains a phishing keyword.
    6. ``other``               - none of the above.

    URLs without a scheme (``evil.tk/login``) are parsed as if prefixed with
    ``//`` so that the host is still recognised. URLs that ``urlparse``
    rejects with ``ValueError`` are returned with type ``'unknown'`` and
    empty ``domain``/``path`` instead of aborting the caller's loop.

    Args:
        url: The raw URL; any object is accepted and passed through ``str``.

    Returns:
        dict with keys ``'url'`` (normalised input), ``'domain'`` (the
        netloc, including any userinfo/port), ``'path'``, ``'type'``, and
        optionally ``'brand'`` or ``'keyword_count'``.

    Note:
        The second-level domain is taken as the second-to-last host label,
        so multi-label public suffixes such as ``co.uk`` are not handled.
        Brand and keyword matching is by substring, so short names such as
        ``'ups'`` can match inside unrelated words.
    """
    url = str(url).strip().lower()
    result = {
        'url': url,
        'domain': '',
        'path': '',
        'type': 'unknown'
    }

    # urlparse() only fills in netloc when the input has a scheme or starts
    # with "//"; without this, scheme-less dataset rows would have no host.
    parse_target = url
    if not url.startswith('//') and not _URL_SCHEME_RE.match(url):
        parse_target = '//' + url

    try:
        parsed = urlparse(parse_target)
    except ValueError:
        # Malformed input (e.g. unbalanced IPv6 brackets): leave as 'unknown'.
        return result

    domain = parsed.netloc
    path = parsed.path
    # Host without userinfo/port and without a trailing root dot, so that
    # label- and suffix-based checks look at the real hostname only.
    host = (parsed.hostname or '').rstrip('.')
    result['domain'] = domain
    result['path'] = path

    # 1. IP-based phishing: the whole host must be an IPv4 literal, not just
    #    contain something that looks like one.
    if _IPV4_HOST_RE.fullmatch(host):
        result['type'] = 'ip_based'
        return result

    # 2. Brand impersonation (brand in URL but not the main domain).
    #    The full netloc is searched so brands hidden in userinfo
    #    ("paypal.com@evil.example") are still seen.
    url_full = domain + path
    labels = host.split('.')
    sld = labels[-2] if len(labels) >= 2 else host

    for brand in _BRANDS:
        if brand in url_full:
            if brand == sld or sld.startswith(brand):
                # Brand is the main domain: treat as legitimate usage and
                # stop looking at further brands.
                break
            result['type'] = 'brand_impersonation'
            result['brand'] = brand
            return result

    # 3. Phishing keywords (generic phishing)
    keyword_count = sum(1 for kw in _PHISHING_KEYWORDS if kw in url_full)
    if keyword_count >= 2:
        result['type'] = 'generic_phishing'
        result['keyword_count'] = keyword_count
        return result

    # 4. Suspicious TLD: must be the actual suffix of the host; a substring
    #    test would flag e.g. "www.topshop.com" because it contains ".top".
    if host.endswith(_SUSPICIOUS_TLDS):
        result['type'] = 'suspicious_tld'
        return result

    # 5. Compromised site (trusted TLD + phishing keyword in path)
    if host.endswith(_TRUSTED_TLDS) and any(kw in path for kw in _PHISHING_KEYWORDS):
        result['type'] = 'compromised_site'
        return result

    # Default
    result['type'] = 'other'
    return result

# Analyze all URLs
print("Analyzing URLs... (this may take a minute)")
results = [analyze_phishing_type(url) for url in phish_df[url_column]]

# With zero rows the DataFrame below would have no columns at all and the
# report would die on a bare KeyError('type'); stop with a clear message.
if not results:
    raise SystemExit(f"No rows found in column '{url_column}'; nothing to analyze.")

results_df = pd.DataFrame(results)
total_urls = len(results_df)  # one result row per input row

# Count types
type_counts = results_df['type'].value_counts()

print("\n=== PHISHING TYPE DISTRIBUTION ===")
for ptype, count in type_counts.items():
    percentage = (count / total_urls) * 100
    print(f"{ptype:20s}: {count:6d} / {total_urls} ({percentage:5.1f}%)")

# Domain characteristics
print("\n=== DOMAIN CHARACTERISTICS ===")

# Domain lengths
domain_lengths = results_df['domain'].apply(len)
print(f"Avg domain length: {domain_lengths.mean():.1f} chars")
print(f"Median domain length: {domain_lengths.median():.1f} chars")

# Number of dot-separated domain parts
num_parts = results_df['domain'].apply(lambda d: len(d.split('.')))
print(f"Avg domain parts: {num_parts.mean():.1f}")
print(f"Median domain parts: {num_parts.median():.1f}")

# Number of subdomains: everything in front of SLD + TLD. Empty or
# single-label hosts have fewer than two parts, so clamp at zero instead of
# letting negative counts drag the average down.
num_subdomains = (num_parts - 2).clip(lower=0)
print(f"Avg subdomains: {num_subdomains.mean():.1f}")

# Path characteristics
print("\n=== PATH CHARACTERISTICS ===")
path_lengths = results_df['path'].apply(len)
print(f"Avg path length: {path_lengths.mean():.1f} chars")
# A path of length <= 1 is either empty or just "/", i.e. no real path.
urls_with_paths = (path_lengths > 1).sum()
print(f"URLs with paths: {urls_with_paths} / {total_urls} ({urls_with_paths/total_urls*100:.1f}%)")

# Show up to three example URLs for each of the five most common types
print("\n=== EXAMPLES BY TYPE ===")
for ptype in type_counts.index[:5]:
    examples = results_df[results_df['type'] == ptype]['url'].head(3)
    print(f"\n{ptype.upper()}:")
    for i, ex in enumerate(examples, 1):
        # Only mark the URL as truncated when it actually was.
        suffix = "..." if len(ex) > 100 else ""
        print(f"  {i}. {ex[:100]}{suffix}")

# Save detailed results
results_df.to_csv('phishing_type_analysis.csv', index=False)
print("\n✅ Detailed results saved to: phishing_type_analysis.csv")