File size: 3,199 Bytes
2cc7f91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import pandas as pd
from urllib.parse import urlparse
from collections import Counter
import re

# Load the deduplicated dataset of classified URLs.
results_df = pd.read_csv('data/raw/clean_dataset_no_duplicates.csv')

print("=== DETAILED 'OTHER' CATEGORY ANALYSIS ===\n")

# Keep only the rows whose classifier label is 'other' — every section
# below analyses this subset.
other_df = results_df.loc[results_df['type'].eq('other')]

# 1. TLD distribution
print("TOP 20 TLDs in 'OTHER' category:")

def _tld(domain):
    # Last dot-separated label with a leading dot; '' for dot-less strings.
    return '.' + domain.rsplit('.', 1)[-1] if '.' in domain else ''

tld_freq = Counter(_tld(d) for d in other_df['domain'])
total_other = len(other_df)
for tld, count in tld_freq.most_common(20):
    print(f"  {tld:10s}: {count:5d} ({count / total_other * 100:4.1f}%)")

# 2. Domain length distribution
print("\n=== DOMAIN LENGTH DISTRIBUTION (OTHER) ===")
lengths = other_df['domain'].str.len()
# median() is identical to quantile(0.5), so all three cut points come
# from the same call site.
q25, q50, q75 = (lengths.quantile(q) for q in (0.25, 0.50, 0.75))
print(f"Min: {lengths.min()}")
print(f"25%: {q25:.0f}")
print(f"50%: {q50:.0f}")
print(f"75%: {q75:.0f}")
print(f"Max: {lengths.max()}")

# 3. Check for non-English brands/keywords
print("\n=== POTENTIAL NON-ENGLISH BRANDS/KEYWORDS ===")

# Tally dot/hyphen/underscore-separated words of length >= 5 across all
# 'other' domains. Fixes in this pass:
#   - dropped the dead `all_domains = ' '.join(...)` (built a huge string
#     that was never read afterwards);
#   - replaced the hand-rolled defaultdict(int) tally with the Counter
#     already imported at the top of the file;
#   - compiled the split pattern once instead of re-parsing it per domain.
_word_splitter = re.compile(r'[\.\-_]')
substring_counts = Counter(
    part
    for domain in other_df['domain']
    for part in _word_splitter.split(domain.lower())
    if len(part) >= 5  # Min 5 chars
)

# Top recurring words. most_common(30) yields the same order as the old
# sorted(..., key=count, reverse=True)[:30]: descending count, ties in
# first-seen order (both are stable).
print("Top 30 recurring words in domains:")
for word, count in substring_counts.most_common(30):
    if count >= 10:  # Appears at least 10 times
        print(f"  {word:30s}: {count:4d} occurrences")

# 4. Digit patterns
print("\n=== DIGIT PATTERNS ===")
digit_mask = other_df['domain'].str.contains(r'\d')
# Name the two quantities instead of recomputing .sum() inside the f-string.
n_with_digits = digit_mask.sum()
n_total = len(other_df)
print(f"Domains with digits: {n_with_digits} / {n_total} ({n_with_digits / n_total * 100:.1f}%)")

# 5. Length of longest part
print("\n=== LONGEST DOMAIN PART ===")
# Longest dot-separated label of each domain (max keeps the first on ties).
longest_parts = other_df['domain'].apply(lambda d: max(d.split('.'), key=len))
longest_part_lens = longest_parts.str.len()
print(f"Avg longest part: {longest_part_lens.mean():.1f} chars")
print(f"Median longest part: {longest_part_lens.median():.1f} chars")

# Show some examples of long domains
print("\nExamples of domains with longest part > 30 chars:")
long_domains = other_df.loc[longest_part_lens > 30, 'url'].head(10)
for url in long_domains:
    # Append an ellipsis only when the URL was actually truncated; the
    # previous code printed "..." unconditionally, making every short URL
    # look cut off.
    suffix = '...' if len(url) > 100 else ''
    print(f"  {url[:100]}{suffix}")

# 6. Hyphen analysis
print("\n=== HYPHEN ANALYSIS ===")
hyphens_per_domain = other_df['domain'].str.count('-')
# Hoist the 3+-hyphen count — it is used twice in the summary line.
n_heavy_hyphen = (hyphens_per_domain >= 3).sum()
print(f"Avg hyphens per domain: {hyphens_per_domain.mean():.2f}")
print(f"Domains with 3+ hyphens: {n_heavy_hyphen} ({n_heavy_hyphen / len(other_df) * 100:.1f}%)")

# 7. Subdomain analysis
print("\n=== SUBDOMAIN ANALYSIS ===")
# Label count = dots + 1; subdomain count assumes a two-label registrable
# domain (example.com -> 0 subdomains).
# NOTE(review): multi-part public suffixes such as .co.uk are over-counted
# by this heuristic — confirm whether that skews the dataset.
label_counts = other_df['domain'].str.count(r'\.') + 1
num_subdomains = label_counts - 2
n_deep = (num_subdomains >= 2).sum()
print(f"Domains with 2+ subdomains: {n_deep} ({n_deep / len(other_df) * 100:.1f}%)")

print("\n✅ Analysis complete!")