File size: 10,967 Bytes
a74b879
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
"""Advanced Keyword Analytics with Topic Clustering and Density Analysis."""
import re
from collections import defaultdict
from typing import List, Dict, Tuple

# spaCy is an optional dependency. `_nlp` holds the loaded 'en_core_web_sm'
# pipeline, or None when spaCy cannot be imported or the model cannot be loaded.
_nlp = None
try:
    import spacy
except Exception:
    # Best-effort: any import failure leaves `_nlp` as None. Catching
    # `Exception` instead of using a bare `except:` lets KeyboardInterrupt
    # and SystemExit propagate.
    pass
else:
    try:
        _nlp = spacy.load('en_core_web_sm')
    except Exception:
        # Model missing or unusable; keep `_nlp` as None.
        pass

# Extended English stopwords: common function words plus URL fragments and
# site navigation / UI vocabulary that should not count as keywords.
STOPWORDS = {
    'the', 'and', 'for', 'with', 'from', 'this', 'that', 'are', 'you', 'your',
    'www', 'http', 'https', 'com', 'org', 'net', 'page', 'pages', 'about',
    'more', 'shop', 'home', 'contact', 'search', 'menu', 'cart', 'login',
    'sign', 'account', 'view', 'add', 'buy', 'price', 'our', 'all', 'new',
    'get', 'now', 'here', 'click', 'read', 'see', 'find', 'back', 'next',
    'prev', 'skip', 'main', 'content', 'footer', 'header', 'sidebar', 'nav',
    'navigation', 'copyright', 'reserved', 'rights', 'privacy', 'terms',
    'conditions', 'policy', 'subscribe', 'newsletter', 'email', 'follow',
    'share', 'like', 'tweet', 'post', 'comment', 'reply', 'submit',
}

# Arabic stopwords (prepositions, pronouns, particles). These must be real
# Arabic code points (U+0600-U+06FF); the file must be saved as UTF-8.
ARABIC_STOPWORDS = {
    'في', 'من', 'إلى', 'على', 'عن', 'مع', 'هذا', 'هذه', 'التي', 'الذي',
    'كل', 'بعض', 'أو', 'لكن', 'ثم', 'قد', 'كان', 'لم', 'لن', 'أن', 'إن',
    'ما', 'لا', 'نحن', 'هم', 'هي', 'هو', 'أنت', 'أنا', 'نا', 'كم', 'كيف',
}

# Union used by the keyword validity check for both languages.
ALL_STOPWORDS = STOPWORDS | ARABIC_STOPWORDS


def clean_keyword(kw: str) -> str:
    """Return *kw* without bracket characters, edge punctuation, or extra spaces.

    Square, round and curly brackets are removed everywhere; any run of
    characters that is neither a word character nor in U+0600-U+06FF is
    trimmed from both ends; internal whitespace runs collapse to one space.
    """
    without_brackets = re.sub(r'[\[\]\(\)\{\}]', '', kw)
    trimmed = re.sub(
        r'^[^\w\u0600-\u06FF]+|[^\w\u0600-\u06FF]+$', '', without_brackets
    )
    collapsed = ' '.join(trimmed.split())
    return collapsed.strip()


# Matches strings made up solely of digits, whitespace and - . , characters.
_NUMERIC_ONLY_RE = re.compile(r'^[\d\s\-\.\,]+$')

# Patterns for navigation / UI boilerplate, applied to the lower-cased keyword.
# Word boundaries (\b) keep them from rejecting ordinary words that merely
# start with or contain a UI word (e.g. "backpack", "homemade", "midterms").
_UI_TEXT_RES = tuple(re.compile(pattern) for pattern in (
    r'^\d+\s*(items?|products?|results?)\b',  # "12 items", "3 results"
    r'^(skip|back|next|prev|home|menu)\b',    # leading navigation word
    r'\b(copyright|reserved|privacy|terms)\b',  # legal/footer vocabulary
    r'^\d+\s*$',  # Just numbers
    r'^[\W_]+$',  # Just punctuation
))


def is_valid_keyword(kw: str, min_length: int = 3) -> bool:
    """Check if a keyword is valid (not stopword, not too short, not junk).

    Args:
        kw: Candidate keyword text.
        min_length: Minimum number of characters required.

    Returns:
        False when the keyword is shorter than ``min_length``, consists only
        of stopwords, contains only digits/whitespace/``-.,`` characters, or
        matches one of the navigation/UI patterns; True otherwise.
    """
    if len(kw) < min_length:
        return False

    kw_lower = kw.lower()

    # Every whitespace-separated word is a stopword (also true for no words).
    if all(word in ALL_STOPWORDS for word in kw_lower.split()):
        return False

    if _NUMERIC_ONLY_RE.match(kw):
        return False

    return not any(ui_re.search(kw_lower) for ui_re in _UI_TEXT_RES)


def calculate_keyword_density(keywords: List[Dict], total_words: int) -> List[Dict]:
    """Annotate each keyword dict in place with a 'density' percentage.

    density = count * (words in keyword) / total_words * 100, rounded to two
    decimals; it is 0 when ``total_words`` is not positive. The same list
    object that was passed in is returned.
    """
    page_has_words = total_words > 0
    for entry in keywords:
        words_in_phrase = len(entry['kw'].split())
        occurrences = entry['count']
        if page_has_words:
            percentage = occurrences * words_in_phrase / total_words * 100
        else:
            percentage = 0
        entry['density'] = round(percentage, 2)
    return keywords


def classify_keywords(keywords: List[Dict]) -> Dict[str, List[Dict]]:
    """Split keywords into 'primary', 'secondary' and 'long_tail' tiers.

    Keywords are ranked by descending 'count'. The primary tier takes the top
    fifth (at least 3 entries), the secondary tier the next three tenths (at
    least 5 entries), and everything left over is long-tail.
    """
    tiers = {'primary': [], 'secondary': [], 'long_tail': []}
    if not keywords:
        return tiers

    ranked = list(keywords)
    ranked.sort(key=lambda item: item['count'], reverse=True)

    total = len(ranked)
    primary_end = max(3, total // 5)
    secondary_end = primary_end + max(5, total * 3 // 10)

    tiers['primary'] = ranked[:primary_end]
    tiers['secondary'] = ranked[primary_end:secondary_end]
    tiers['long_tail'] = ranked[secondary_end:]
    return tiers


# Ordered (cluster name, trigger substrings) rules; the first rule with a
# substring found in the lower-cased keyword wins. Each rule lists English
# terms followed by Arabic ones, which must be real Arabic code points.
_TOPIC_RULES = (
    ('SEO & Search', (
        'seo', 'search', 'ranking', 'optimization',
        'محرك',  # engine
        'بحث',  # search
    )),
    ('E-commerce', (
        'shop', 'store', 'product', 'price', 'buy',
        'متجر',  # store
        'منتج',  # product
    )),
    ('Content', (
        'blog', 'article', 'post', 'content',
        'مقال',  # article
        'محتوى',  # content
    )),
    ('Location', (
        'city', 'location', 'address',
        'مدينة',  # city
        'موقع',  # location / site
        'عنوان',  # address
    )),
)


def cluster_by_topic(keywords: List[Dict]) -> Dict[str, List[Dict]]:
    """Group keywords by topic using simple substring rules.

    Args:
        keywords: Keyword dicts with 'kw' (text) and 'count' (frequency).

    Returns:
        Mapping of cluster name to the keyword dicts assigned to it, in input
        order. Only clusters that received at least one keyword appear.
        Keywords matching no topic rule go to 'Brand/Product' when they occur
        at least 5 times and have at most two words, otherwise to 'General'.
    """
    clusters = defaultdict(list)

    for kw_obj in keywords:
        kw = kw_obj['kw'].lower()

        topic = next(
            (name for name, terms in _TOPIC_RULES
             if any(term in kw for term in terms)),
            None,
        )
        if topic is None:
            # Frequent, short phrases are treated as brand/product names.
            if kw_obj['count'] >= 5 and len(kw.split()) <= 2:
                topic = 'Brand/Product'
            else:
                topic = 'General'

        clusters[topic].append(kw_obj)

    # Return a plain dict; entries only exist for non-empty clusters.
    return dict(clusters)


def calculate_coverage_score(keywords: List[Dict], expected_keywords: List[str]) -> Dict:
    """Calculate topic coverage score based on expected keywords.

    Comparison is case-insensitive and exact (whole keyword text).

    Args:
        keywords: Keyword dicts with a 'kw' entry.
        expected_keywords: Keywords the page is expected to contain. ``None``
            is treated as an empty list.

    Returns:
        Dict with 'score' (percentage of expected keywords found, one
        decimal; 0 when nothing is expected), 'matched' and 'missing'
        (lower-cased keywords, de-duplicated, in the order they appear in
        ``expected_keywords``), 'total_expected' and 'total_found'.
    """
    found = {kw['kw'].lower() for kw in keywords}

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result lists are deterministic instead of following set iteration order.
    expected = list(dict.fromkeys(k.lower() for k in expected_keywords or []))

    matched = [k for k in expected if k in found]
    missing = [k for k in expected if k not in found]

    coverage = (len(matched) / len(expected) * 100) if expected else 0

    return {
        'score': round(coverage, 1),
        'matched': matched,
        'missing': missing,
        'total_expected': len(expected),
        'total_found': len(matched)
    }


def analyze_keywords(keywords: List[Dict], total_words: int = 0,
                     expected_keywords: List[str] = None) -> Dict:
    """
    Build the full keyword analytics report from raw keyword counts.

    Args:
        keywords: List of keyword dicts with 'kw' and 'count'
        total_words: Total word count on page; densities are only computed
            when this is positive
        expected_keywords: Optional keywords to check coverage against

    Returns:
        Dict with 'summary', 'top_keywords' (first 10 by count),
        'classification', 'clusters', 'coverage' (None when no expected
        keywords were given) and 'all_keywords'
    """
    # Phase 1: normalise each keyword's text and keep only the valid ones.
    # Entries are copied, so the caller's dicts are never modified.
    cleaned_pairs = ((entry, clean_keyword(entry['kw'])) for entry in keywords)
    candidates = [
        {**entry, 'kw': text}
        for entry, text in cleaned_pairs
        if is_valid_keyword(text)
    ]

    # Phase 2: collapse case-insensitive duplicates, summing their counts
    # into the first occurrence (dicts preserve first-seen order).
    merged = {}
    for entry in candidates:
        key = entry['kw'].lower()
        if key in merged:
            merged[key]['count'] += entry['count']
        else:
            merged[key] = entry

    # Most frequent first; ties keep first-seen order (stable sort).
    ranked = sorted(merged.values(), key=lambda item: item['count'], reverse=True)

    if total_words > 0:
        ranked = calculate_keyword_density(ranked, total_words)

    classification = classify_keywords(ranked)
    clusters = cluster_by_topic(ranked)

    if expected_keywords:
        coverage = calculate_coverage_score(ranked, expected_keywords)
    else:
        coverage = None

    keyword_total = len(ranked)
    if keyword_total > 0:
        mean_count = sum(item['count'] for item in ranked) / keyword_total
    else:
        mean_count = 0

    summary = {
        'total_keywords': keyword_total,
        'total_words': total_words,
        'avg_frequency': round(mean_count, 1),
        'primary_keywords': len(classification['primary']),
        'secondary_keywords': len(classification['secondary']),
        'long_tail_keywords': len(classification['long_tail'])
    }

    return {
        'summary': summary,
        'top_keywords': ranked[:10],
        'classification': classification,
        'clusters': clusters,
        'coverage': coverage,
        'all_keywords': ranked
    }


def format_analytics_report(analytics: Dict) -> str:
    """Format analytics as a readable text report.

    Args:
        analytics: Report dict with 'summary', 'top_keywords',
            'classification' and 'clusters' entries, plus an optional
            'coverage' entry (the shape returned by ``analyze_keywords``).

    Returns:
        The report as a single newline-joined string. At most 5 keywords are
        listed per classification tier, 3 per topic cluster, and 5 missing
        keywords in the coverage section.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    # Non-ASCII glyphs are written as escapes so these literals stay ASCII
    # and cannot be garbled by saving the file in the wrong encoding.
    bullet = "\u2022"            # bullet
    cross = "\u274C"             # cross mark
    keycap = "\uFE0F\u20E3"      # turns a digit into a keycap emoji
    icon_summary = "\U0001F4CA"  # bar chart
    # NOTE(review): this heading glyph was reconstructed from mis-encoded
    # text (U+1F51D "TOP" vs U+1F50D magnifier) -- confirm the intended emoji.
    icon_top = "\U0001F51D"      # TOP arrow
    icon_classes = "\U0001F4CB"  # clipboard
    icon_clusters = "\U0001F3AF"  # direct hit
    icon_coverage = "\U0001F4C8"  # chart with upwards trend

    def keyword_line(entry):
        return f"  {bullet} {entry['kw']} ({entry['count']})"

    lines = [heavy_rule, "KEYWORD ANALYTICS REPORT", heavy_rule]

    # Summary
    summary = analytics['summary']
    lines.append(f"\n{icon_summary} SUMMARY")
    lines.append(light_rule)
    lines.append(f"Total Keywords Found: {summary['total_keywords']}")
    lines.append(f"Total Words on Page: {summary['total_words']}")
    lines.append(f"Average Keyword Frequency: {summary['avg_frequency']}")
    lines.append(f"Primary Keywords: {summary['primary_keywords']}")
    lines.append(f"Secondary Keywords: {summary['secondary_keywords']}")
    lines.append(f"Long-tail Keywords: {summary['long_tail_keywords']}")

    # Top keywords table
    lines.append(f"\n{icon_top} TOP KEYWORDS")
    lines.append(light_rule)
    lines.append(f"{'Keyword':<40} {'Frequency':<12} {'Density':<10}")
    lines.append(light_rule)
    for kw in analytics['top_keywords']:
        # Density is only present when a total word count was supplied.
        density = f"{kw['density']:.2f}%" if 'density' in kw else 'N/A'
        lines.append(f"{kw['kw']:<40} {kw['count']:<12} {density:<10}")

    # Classification tiers
    lines.append(f"\n{icon_classes} KEYWORD CLASSIFICATION")
    lines.append(light_rule)
    classification = analytics['classification']
    tiers = (
        ("1", "PRIMARY KEYWORDS (High Priority)", 'primary'),
        ("2", "SECONDARY KEYWORDS (Medium Priority)", 'secondary'),
        ("3", "LONG-TAIL KEYWORDS (Low Priority)", 'long_tail'),
    )
    for digit, title, tier_key in tiers:
        lines.append(f"\n{digit}{keycap} {title}")
        lines.extend(keyword_line(kw) for kw in classification[tier_key][:5])

    # Topic clusters
    lines.append(f"\n{icon_clusters} TOPIC CLUSTERS")
    lines.append(light_rule)
    for topic, kws in analytics['clusters'].items():
        lines.append(f"\n{topic} ({len(kws)} keywords)")
        lines.extend(keyword_line(kw) for kw in kws[:3])

    # Coverage (section is omitted when absent or empty)
    coverage = analytics.get('coverage')
    if coverage:
        lines.append(f"\n{icon_coverage} TOPIC COVERAGE")
        lines.append(light_rule)
        lines.append(f"Coverage Score: {coverage['score']}%")
        lines.append(f"Matched Keywords: {coverage['total_found']}/{coverage['total_expected']}")
        if coverage['missing']:
            lines.append("\nMissing Keywords:")
            lines.extend(f"  {cross} {kw}" for kw in coverage['missing'][:5])

    lines.append("\n" + heavy_rule)
    return "\n".join(lines)