# Exploratory analysis of the 'other' URL category: TLDs, domain lengths,
# recurring keywords, digit/hyphen usage, and subdomain depth.
import pandas as pd
from urllib.parse import urlparse
from collections import Counter
import re
# Load the de-duplicated dataset and narrow it to rows labelled 'other'.
results_df = pd.read_csv('data/raw/clean_dataset_no_duplicates.csv')
print("=== DETAILED 'OTHER' CATEGORY ANALYSIS ===\n")
# NOTE(review): assumes columns 'type', 'domain' and 'url' exist — the rest
# of the script depends on all three.
other_df = results_df.loc[results_df['type'] == 'other']
# 1. Distribution of top-level domains within the 'other' subset.
print("TOP 20 TLDs in 'OTHER' category:")
# Everything after the final dot (dot kept); empty string when no dot at all.
suffixes = other_df['domain'].map(
    lambda name: '.' + name.rsplit('.', 1)[-1] if '.' in name else ''
)
tld_counts = Counter(suffixes)
total = len(other_df)
for tld, count in tld_counts.most_common(20):
    pct = (count / total) * 100
    print(f" {tld:10s}: {count:5d} ({pct:4.1f}%)")
# 2. Summary statistics for domain-name length in the 'other' subset.
print("\n=== DOMAIN LENGTH DISTRIBUTION (OTHER) ===")
domain_lengths = other_df['domain'].str.len()
# Label/value pairs, printed in fixed order to match the report layout.
summary_rows = [
    ("Min", f"{domain_lengths.min()}"),
    ("25%", f"{domain_lengths.quantile(0.25):.0f}"),
    ("50%", f"{domain_lengths.median():.0f}"),
    ("75%", f"{domain_lengths.quantile(0.75):.0f}"),
    ("Max", f"{domain_lengths.max()}"),
]
for label, value in summary_rows:
    print(f"{label}: {value}")
# 3. Recurring words inside domain names — a rough proxy for brands or
#    (possibly non-English) keywords the labelling missed.
print("\n=== POTENTIAL NON-ENGLISH BRANDS/KEYWORDS ===")
# Split each lowercased domain on dots, hyphens and underscores, and count
# every fragment of at least 5 characters.  Uses the Counter already
# imported at the top of the file instead of a hand-rolled defaultdict;
# the regex is compiled once outside the loop.
_sep_re = re.compile(r'[.\-_]')
substring_counts = Counter(
    part
    for domain in other_df['domain']
    for part in _sep_re.split(domain.lower())
    if len(part) >= 5  # ignore short, uninformative fragments
)
# Top recurring words (stable sort keeps first-seen order on ties,
# matching the original output).
print("Top 30 recurring words in domains:")
for word, count in sorted(substring_counts.items(), key=lambda kv: kv[1], reverse=True)[:30]:
    if count >= 10:  # Appears at least 10 times
        print(f" {word:30s}: {count:4d} occurrences")
# 4. How many domains contain at least one digit?
print("\n=== DIGIT PATTERNS ===")
digit_mask = other_df['domain'].str.contains(r'\d')
with_digits = digit_mask.sum()
total = len(other_df)
print(f"Domains with digits: {with_digits} / {total} ({with_digits/total*100:.1f}%)")
# 5. Length of the longest dot-separated label per domain — very long
#    labels often indicate auto-generated or deceptive hostnames.
print("\n=== LONGEST DOMAIN PART ===")
longest_parts = other_df['domain'].apply(lambda d: max(d.split('.'), key=len))
longest_part_lens = longest_parts.str.len()
print(f"Avg longest part: {longest_part_lens.mean():.1f} chars")
print(f"Median longest part: {longest_part_lens.median():.1f} chars")
# Show some examples of long domains
print("\nExamples of domains with longest part > 30 chars:")
long_domains = other_df.loc[longest_part_lens > 30, 'url'].head(10)
for url in long_domains:
    # Truncate for display; append an ellipsis only when truncation
    # actually occurred (the old code printed '...' unconditionally).
    suffix = '...' if len(url) > 100 else ''
    print(f" {url[:100]}{suffix}")
# 6. Hyphen usage — heavily hyphenated hostnames are a common red flag.
print("\n=== HYPHEN ANALYSIS ===")
hyphen_counts = other_df['domain'].str.count('-')
many_hyphens = (hyphen_counts >= 3).sum()
print(f"Avg hyphens per domain: {hyphen_counts.mean():.2f}")
print(f"Domains with 3+ hyphens: {many_hyphens} ({many_hyphens/len(other_df)*100:.1f}%)")
# 7. Subdomain depth: label count minus two (registrable domain + TLD).
print("\n=== SUBDOMAIN ANALYSIS ===")
label_counts = other_df['domain'].str.count(r'\.') + 1
deep_count = (label_counts - 2 >= 2).sum()
print(f"Domains with 2+ subdomains: {deep_count} ({deep_count/len(other_df)*100:.1f}%)")
print("\n✅ Analysis complete!")