Spaces:
Runtime error
Runtime error
"""Exploratory analysis of the 'other' URL category.

Reports TLD distribution, domain-length statistics, recurring keywords,
digit/hyphen patterns and subdomain depth, all printed to stdout.
"""
import pandas as pd
from urllib.parse import urlparse  # NOTE(review): unused here; kept in case other tooling relies on it
from collections import Counter, defaultdict
import re

# Default location of the deduplicated dataset.
DEFAULT_CSV = 'data/raw/clean_dataset_no_duplicates.csv'


def main(csv_path: str = DEFAULT_CSV) -> None:
    """Run the full 'other'-category analysis on the CSV at *csv_path*.

    The CSV must contain at least the columns 'type', 'domain' and 'url'.
    All results are printed; nothing is returned.
    """
    # Load detailed results
    results_df = pd.read_csv(csv_path)
    print("=== DETAILED 'OTHER' CATEGORY ANALYSIS ===\n")

    # Filter only 'other' type
    other_df = results_df[results_df['type'] == 'other']
    if other_df.empty:
        # Guard: every percentage below divides by len(other_df).
        print("No rows with type == 'other'; nothing to analyze.")
        return

    # 1. TLD distribution
    print("TOP 20 TLDs in 'OTHER' category:")
    tlds = other_df['domain'].apply(lambda d: '.' + d.split('.')[-1] if '.' in d else '')
    for tld, count in Counter(tlds).most_common(20):
        pct = (count / len(other_df)) * 100
        print(f" {tld:10s}: {count:5d} ({pct:4.1f}%)")

    # 2. Domain length distribution
    print("\n=== DOMAIN LENGTH DISTRIBUTION (OTHER) ===")
    lengths = other_df['domain'].str.len()
    print(f"Min: {lengths.min()}")
    print(f"25%: {lengths.quantile(0.25):.0f}")
    print(f"50%: {lengths.median():.0f}")
    print(f"75%: {lengths.quantile(0.75):.0f}")
    print(f"Max: {lengths.max()}")

    # 3. Check for non-English brands/keywords
    print("\n=== POTENTIAL NON-ENGLISH BRANDS/KEYWORDS ===")
    substring_counts = defaultdict(int)
    for domain in other_df['domain']:
        # Extract words (split by dots, hyphens, underscores)
        for part in re.split(r'[\.\-_]', domain.lower()):
            if len(part) >= 5:  # Min 5 chars
                substring_counts[part] += 1
    # Top recurring words
    print("Top 30 recurring words in domains:")
    for word, count in sorted(substring_counts.items(), key=lambda x: x[1], reverse=True)[:30]:
        if count >= 10:  # Appears at least 10 times
            print(f" {word:30s}: {count:4d} occurrences")

    # 4. Digit patterns
    print("\n=== DIGIT PATTERNS ===")
    has_digits = other_df['domain'].str.contains(r'\d')
    print(f"Domains with digits: {has_digits.sum()} / {len(other_df)} ({has_digits.sum()/len(other_df)*100:.1f}%)")

    # 5. Length of longest dot-separated label
    print("\n=== LONGEST DOMAIN PART ===")
    longest_parts = other_df['domain'].apply(lambda d: max(d.split('.'), key=len))
    longest_part_lens = longest_parts.str.len()
    print(f"Avg longest part: {longest_part_lens.mean():.1f} chars")
    print(f"Median longest part: {longest_part_lens.median():.1f} chars")
    # Show some examples of long domains
    print("\nExamples of domains with longest part > 30 chars:")
    for url in other_df[longest_part_lens > 30]['url'].head(10):
        print(f" {url[:100]}...")

    # 6. Hyphen analysis
    print("\n=== HYPHEN ANALYSIS ===")
    hyphen_counts = other_df['domain'].str.count('-')
    print(f"Avg hyphens per domain: {hyphen_counts.mean():.2f}")
    print(f"Domains with 3+ hyphens: {(hyphen_counts >= 3).sum()} ({(hyphen_counts >= 3).sum()/len(other_df)*100:.1f}%)")

    # 7. Subdomain analysis: labels beyond registrable-domain + TLD,
    # i.e. (dot count + 1 parts) - 2 == dot count - 1.
    print("\n=== SUBDOMAIN ANALYSIS ===")
    num_subdomains = other_df['domain'].str.count(r'\.') - 1
    print(f"Domains with 2+ subdomains: {(num_subdomains >= 2).sum()} ({(num_subdomains >= 2).sum()/len(other_df)*100:.1f}%)")

    print("\n✅ Analysis complete!")


if __name__ == "__main__":
    main()