| """Advanced Keyword Analytics with Topic Clustering and Density Analysis.""" |
| import re |
| from collections import defaultdict |
| from typing import List, Dict, Tuple |
|
|
# Optional spaCy support. `_nlp` is the loaded 'en_core_web_sm' pipeline, or
# None when spaCy cannot be imported or the model cannot be loaded.
_nlp = None
try:
    import spacy
except Exception:
    # Best-effort: any import-time failure simply disables spaCy.
    # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
    _nlp = None
else:
    try:
        _nlp = spacy.load('en_core_web_sm')
    except Exception:
        # Model missing or incompatible; keep running without spaCy.
        _nlp = None
|
|
| |
# English stopwords plus common website boilerplate tokens that should never
# count as keywords on their own.
STOPWORDS = {
    # function words
    'the', 'and', 'for', 'with', 'from', 'this', 'that', 'are', 'you',
    'your', 'our', 'all', 'new', 'more', 'about', 'get', 'now', 'here',
    # URL fragments
    'www', 'http', 'https', 'com', 'org', 'net',
    # page chrome / navigation
    'page', 'pages', 'home', 'contact', 'search', 'menu', 'main', 'content',
    'footer', 'header', 'sidebar', 'nav', 'navigation', 'back', 'next',
    'prev', 'skip',
    # shop / account UI
    'shop', 'cart', 'login', 'sign', 'account', 'view', 'add', 'buy', 'price',
    # calls to action
    'click', 'read', 'see', 'find', 'subscribe', 'newsletter', 'email',
    'follow',
    # legal boilerplate
    'copyright', 'reserved', 'rights', 'privacy', 'terms', 'conditions',
    'policy',
    # social / form actions
    'share', 'like', 'tweet', 'post', 'comment', 'reply', 'submit',
}
|
|
| |
# Arabic function words (prepositions, pronouns, particles) treated as stopwords.
ARABIC_STOPWORDS = {
    'في', 'من', 'إلى', 'على', 'عن', 'مع', 'هذا', 'هذه', 'التي', 'الذي',
    'كل', 'بعض', 'أو', 'لكن', 'ثم', 'قد', 'كان', 'لم', 'لن', 'أن', 'إن',
    'ما', 'لا', 'نحن', 'هم', 'هي', 'هو', 'أنت', 'أنا', 'نا', 'كم', 'كيف',
}
|
|
# Combined English + Arabic stopword set used for keyword validation.
ALL_STOPWORDS = STOPWORDS.union(ARABIC_STOPWORDS)
|
|
|
|
def clean_keyword(kw: str) -> str:
    """Normalise a raw keyword string.

    Removes every bracket character (``[](){}``), trims leading and trailing
    characters that are neither word characters nor in the Arabic range
    U+0600-U+06FF, and collapses whitespace runs to single spaces.
    """
    without_brackets = re.sub(r'[\[\]\(\)\{\}]', '', kw)
    edge_trimmed = re.sub(
        r'^[^\w\u0600-\u06FF]+|[^\w\u0600-\u06FF]+$', '', without_brackets
    )
    # split() + join collapses inner whitespace and leaves no outer padding.
    return ' '.join(edge_trimmed.split())
|
|
|
|
# Keyword made up solely of digits, whitespace and - . , characters.
_NUMERIC_ONLY_RE = re.compile(r'^[\d\s\-\.\,]+$')

# UI/boilerplate patterns, matched against the lower-cased keyword.
# The two prefix patterns end in \b so they only match whole leading words:
# "back to top" is junk, "backlinks" / "prevention" / "preview" are not.
_UI_JUNK_PATTERNS = tuple(re.compile(p) for p in (
    r'^\d+\s*(items?|products?|results?)\b',
    r'^(skip|back|next|prev|home|menu)\b',
    r'(copyright|reserved|privacy|terms)',  # matched anywhere in the keyword
    r'^\d+\s*$',
    r'^[\W_]+$',
))


def is_valid_keyword(kw: str, min_length: int = 3) -> bool:
    """Check if a keyword is valid (not stopword, not too short, not junk).

    Args:
        kw: Keyword text to test.
        min_length: Minimum number of characters a keyword must have.

    Returns:
        False when the keyword is shorter than ``min_length``, consists only
        of stopwords, is purely numeric/punctuation, or matches one of the
        UI boilerplate patterns; True otherwise.
    """
    if len(kw) < min_length:
        return False

    lowered = kw.lower()

    # all() over an empty split is True, so whitespace-only input is
    # rejected here as well.
    if all(word in ALL_STOPWORDS for word in lowered.split()):
        return False

    if _NUMERIC_ONLY_RE.match(kw):
        return False

    return not any(pattern.search(lowered) for pattern in _UI_JUNK_PATTERNS)
|
|
|
|
def calculate_keyword_density(keywords: List[Dict], total_words: int) -> List[Dict]:
    """Attach a ``'density'`` percentage to every keyword dict.

    Density is ``count * words_in_keyword / total_words * 100`` rounded to
    two decimals, or 0 when ``total_words`` is not positive. The dicts are
    updated in place and the same list is returned.
    """
    page_has_words = total_words > 0
    for entry in keywords:
        words_in_phrase = len(entry['kw'].split())
        occurrences = entry['count']
        if page_has_words:
            share = occurrences * words_in_phrase / total_words * 100
        else:
            share = 0
        entry['density'] = round(share, 2)
    return keywords
|
|
|
|
def classify_keywords(keywords: List[Dict]) -> Dict[str, List[Dict]]:
    """Split keywords into primary, secondary and long-tail tiers by count.

    Keywords are ranked by descending ``'count'`` (ties keep input order).
    The top ``max(3, n // 5)`` are primary, the next ``max(5, n * 3 // 10)``
    are secondary, and whatever remains is long-tail.
    """
    if not keywords:
        return {'primary': [], 'secondary': [], 'long_tail': []}

    ranked = list(keywords)
    ranked.sort(key=lambda item: item['count'], reverse=True)

    total = len(ranked)
    primary_end = max(3, total // 5)
    secondary_end = primary_end + max(5, total * 3 // 10)

    return {
        'primary': ranked[:primary_end],
        'secondary': ranked[primary_end:secondary_end],
        'long_tail': ranked[secondary_end:],
    }
|
|
|
|
def cluster_by_topic(keywords: List[Dict]) -> Dict[str, List[Dict]]:
    """Group keywords into topic buckets by substring matching.

    Each keyword goes into the first topic whose term list contains a
    substring of the lower-cased keyword. Unmatched keywords with a count of
    at least 5 and at most two words go to ``'Brand/Product'``; everything
    else goes to ``'General'``. Only topics that received a keyword appear
    in the result.
    """
    # Checked in order; the first matching topic wins.
    topic_terms = (
        ('SEO & Search', ('seo', 'search', 'ranking', 'optimization', 'محرك', 'بحث')),
        ('E-commerce', ('shop', 'store', 'product', 'price', 'buy', 'متجر', 'منتج')),
        ('Content', ('blog', 'article', 'post', 'content', 'مقال', 'محتوى')),
        ('Location', ('city', 'location', 'address', 'مدينة', 'موقع', 'عنوان')),
    )

    grouped = defaultdict(list)
    for entry in keywords:
        text = entry['kw'].lower()
        topic = next(
            (
                label
                for label, terms in topic_terms
                if any(term in text for term in terms)
            ),
            None,
        )
        if topic is None:
            short_and_frequent = entry['count'] >= 5 and len(text.split()) <= 2
            topic = 'Brand/Product' if short_and_frequent else 'General'
        grouped[topic].append(entry)

    # Buckets are created only on append, so none of them is empty.
    return dict(grouped)
|
|
|
|
def calculate_coverage_score(keywords: List[Dict], expected_keywords: List[str]) -> Dict:
    """Calculate topic coverage score based on expected keywords.

    Matching is exact and case-insensitive on the whole keyword text.

    Args:
        keywords: Keyword dicts; only the ``'kw'`` entry is read.
        expected_keywords: Keywords the page is expected to contain.
            Duplicates (ignoring case) count once; ``None`` is treated as
            an empty list.

    Returns:
        Dict with ``'score'`` (percentage of expected keywords found,
        rounded to one decimal, 0 when nothing is expected), ``'matched'``
        and ``'missing'`` (lower-cased, in the order given in
        ``expected_keywords``), ``'total_expected'`` and ``'total_found'``.
    """
    found = {kw['kw'].lower() for kw in keywords}

    # dict.fromkeys de-duplicates while keeping the caller's order, so the
    # matched/missing lists are deterministic across runs (iterating a set
    # of strings is not, because of hash randomisation).
    expected = list(dict.fromkeys(k.lower() for k in (expected_keywords or ())))

    matched = [k for k in expected if k in found]
    missing = [k for k in expected if k not in found]

    coverage = (len(matched) / len(expected) * 100) if expected else 0

    return {
        'score': round(coverage, 1),
        'matched': matched,
        'missing': missing,
        'total_expected': len(expected),
        'total_found': len(matched),
    }
|
|
|
|
def analyze_keywords(keywords: List[Dict], total_words: int = 0,
                     expected_keywords: List[str] = None) -> Dict:
    """
    Build a full analytics report for a list of extracted keywords.

    Keywords are cleaned, filtered, merged case-insensitively (counts are
    summed; the first occurrence's other fields win), ranked by count, and
    then classified, clustered and optionally scored for coverage.

    Args:
        keywords: List of keyword dicts with 'kw' and 'count'
        total_words: Total word count on page; density is only computed
            when this is greater than zero
        expected_keywords: Expected keywords for coverage analysis; when
            empty or None the 'coverage' entry is None

    Returns:
        Dict with 'summary', 'top_keywords' (first 10), 'classification',
        'clusters', 'coverage' and 'all_keywords'
    """
    # Clean, validate and merge in one pass. Each stored entry is a copy,
    # so the caller's dicts are never mutated.
    merged = {}
    for raw in keywords:
        text = clean_keyword(raw['kw'])
        if not is_valid_keyword(text):
            continue
        key = text.lower()
        if key in merged:
            merged[key]['count'] += raw['count']
        else:
            merged[key] = {**raw, 'kw': text}

    ranked = sorted(merged.values(), key=lambda item: item['count'], reverse=True)

    if total_words > 0:
        ranked = calculate_keyword_density(ranked, total_words)

    tiers = classify_keywords(ranked)
    topics = cluster_by_topic(ranked)
    coverage = (
        calculate_coverage_score(ranked, expected_keywords)
        if expected_keywords
        else None
    )

    keyword_total = len(ranked)
    if keyword_total > 0:
        mean_count = sum(item['count'] for item in ranked) / keyword_total
    else:
        mean_count = 0

    summary = {
        'total_keywords': keyword_total,
        'total_words': total_words,
        'avg_frequency': round(mean_count, 1),
        'primary_keywords': len(tiers['primary']),
        'secondary_keywords': len(tiers['secondary']),
        'long_tail_keywords': len(tiers['long_tail']),
    }

    return {
        'summary': summary,
        'top_keywords': ranked[:10],
        'classification': tiers,
        'clusters': topics,
        'coverage': coverage,
        'all_keywords': ranked,
    }
|
|
|
|
def format_analytics_report(analytics: Dict) -> str:
    """Render an analytics dict as a plain-text report.

    Reads the 'summary', 'top_keywords', 'classification' and 'clusters'
    entries, plus 'coverage' when it is present and non-empty. Returns the
    report as a single newline-joined string.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    def bullet_rows(entries, limit):
        # One "keyword (count)" bullet per entry, capped at `limit` entries.
        return [f" • {item['kw']} ({item['count']})" for item in entries[:limit]]

    report = [heavy_rule, "KEYWORD ANALYTICS REPORT", heavy_rule]

    # Summary block
    stats = analytics['summary']
    report += [
        "\n📊 SUMMARY",
        light_rule,
        f"Total Keywords Found: {stats['total_keywords']}",
        f"Total Words on Page: {stats['total_words']}",
        f"Average Keyword Frequency: {stats['avg_frequency']}",
        f"Primary Keywords: {stats['primary_keywords']}",
        f"Secondary Keywords: {stats['secondary_keywords']}",
        f"Long-tail Keywords: {stats['long_tail_keywords']}",
    ]

    # Top keywords table
    report += [
        "\n🔝 TOP KEYWORDS",
        light_rule,
        f"{'Keyword':<40} {'Frequency':<12} {'Density':<10}",
        light_rule,
    ]
    for row in analytics['top_keywords']:
        # Density is absent when no total word count was available.
        if 'density' in row:
            density_cell = f"{row['density']:.2f}%"
        else:
            density_cell = 'N/A'
        report.append(f"{row['kw']:<40} {row['count']:<12} {density_cell:<10}")

    # Classification tiers, at most five keywords each
    report += ["\n📋 KEYWORD CLASSIFICATION", light_rule]
    tiers = analytics['classification']
    tier_sections = (
        ("\n1️⃣ PRIMARY KEYWORDS (High Priority)", 'primary'),
        ("\n2️⃣ SECONDARY KEYWORDS (Medium Priority)", 'secondary'),
        ("\n3️⃣ LONG-TAIL KEYWORDS (Low Priority)", 'long_tail'),
    )
    for heading, tier_key in tier_sections:
        report.append(heading)
        report += bullet_rows(tiers[tier_key], 5)

    # Topic clusters, at most three keywords each
    report += ["\n🎯 TOPIC CLUSTERS", light_rule]
    for topic, members in analytics['clusters'].items():
        report.append(f"\n{topic} ({len(members)} keywords)")
        report += bullet_rows(members, 3)

    # Optional coverage block
    coverage = analytics.get('coverage')
    if coverage:
        report += [
            "\n📈 TOPIC COVERAGE",
            light_rule,
            f"Coverage Score: {coverage['score']}%",
            f"Matched Keywords: {coverage['total_found']}/{coverage['total_expected']}",
        ]
        if coverage['missing']:
            report.append("\nMissing Keywords:")
            report += [f" ❌ {name}" for name in coverage['missing'][:5]]

    report.append("\n" + heavy_rule)
    return "\n".join(report)
|
|