"""Advanced Keyword Analytics with Topic Clustering and Density Analysis."""
import re
from collections import defaultdict
from typing import List, Dict, Tuple
# spaCy is an optional dependency. When the package cannot be imported or the
# 'en_core_web_sm' model cannot be loaded, `_nlp` stays None so this module
# still imports.
_nlp = None
try:
    import spacy

    _nlp = spacy.load('en_core_web_sm')
except Exception:
    # Deliberately best-effort, but `Exception` (not a bare `except:`) so that
    # KeyboardInterrupt / SystemExit are not swallowed during import.
    _nlp = None
# Extended English stopwords: function words plus common website chrome
# (navigation, footer, e-commerce and social-sharing labels).
STOPWORDS = {
    'the', 'and', 'for', 'with', 'from', 'this', 'that', 'are', 'you', 'your',
    'www', 'http', 'https', 'com', 'org', 'net', 'page', 'pages', 'about',
    'more', 'shop', 'home', 'contact', 'search', 'menu', 'cart', 'login',
    'sign', 'account', 'view', 'add', 'buy', 'price', 'our', 'all', 'new',
    'get', 'now', 'here', 'click', 'read', 'see', 'find', 'back', 'next',
    'prev', 'skip', 'main', 'content', 'footer', 'header', 'sidebar', 'nav',
    'navigation', 'copyright', 'reserved', 'rights', 'privacy', 'terms',
    'conditions', 'policy', 'subscribe', 'newsletter', 'email', 'follow',
    'share', 'like', 'tweet', 'post', 'comment', 'reply', 'submit',
}

# Arabic stopwords.
# NOTE(review): these literals were restored from a mis-decoded (mojibake)
# copy in which several letters (ف ق ك ل ن ه و ى ي) were indistinguishable.
# The entries 'في', 'كل', 'أي', 'لكن', 'لن', 'لا', 'هي', 'هو', 'يا', 'كم' and
# 'كيف' are the most plausible readings - confirm against the canonical list.
ARABIC_STOPWORDS = {
    'في', 'من', 'إلى', 'على', 'عن', 'مع', 'هذا', 'هذه', 'التي', 'الذي',
    'كل', 'بعض', 'أي', 'لكن', 'ثم', 'قد', 'كان', 'لم', 'لن', 'أن', 'إن',
    'ما', 'لا', 'نحن', 'هم', 'هي', 'هو', 'أنت', 'أنا', 'يا', 'كم', 'كيف',
}

# Union used when filtering keywords in either language.
ALL_STOPWORDS = STOPWORDS | ARABIC_STOPWORDS
def clean_keyword(kw: str) -> str:
    """Return *kw* without brackets, edge punctuation or redundant spaces.

    Steps, in order: delete every ``[ ] ( ) { }`` character, trim any run of
    non-word characters from both ends, then collapse internal whitespace
    runs to single spaces.
    """
    without_brackets = re.sub(r'[\[\]\(\)\{\}]', '', kw)
    trimmed = re.sub(
        r'^[^\w\u0600-\u06FF]+|[^\w\u0600-\u06FF]+$', '', without_brackets
    )
    tokens = trimmed.split()
    return ' '.join(tokens).strip()
# Matches keywords made up solely of digits, whitespace, '-', '.' and ','.
_NUMERIC_ONLY_RE = re.compile(r'^[\d\s\-\.\,]+$')

# Navigation / UI boilerplate. Word boundaries keep the alternatives from
# matching as bare prefixes or substrings of unrelated words ("backpack",
# "homemade", "midterms"); "previous" is still covered explicitly.
_UI_TEXT_RES = tuple(re.compile(pattern) for pattern in (
    r'^\d+\s*(items?|products?|results?)\b',
    r'^(skip|back|next|prev(?:ious)?|home|menu)\b',
    r'\b(copyright|reserved|privacy|terms)\b',
    r'^\d+\s*$',  # Just numbers
    r'^[\W_]+$',  # Just punctuation
))


def is_valid_keyword(kw: str, min_length: int = 3) -> bool:
    """Return True when *kw* is worth keeping as a keyword.

    A keyword is rejected when any of the following holds:

    * it is shorter than ``min_length`` characters;
    * every whitespace-separated word (lower-cased) is in ``ALL_STOPWORDS``;
    * it consists only of digits, whitespace, ``-``, ``.`` and ``,``;
    * its lower-cased form looks like navigation/UI text (item counts,
      leading "skip/back/next/prev/home/menu", legal-footer words, bare
      numbers or bare punctuation).

    Args:
        kw: Candidate keyword.
        min_length: Minimum accepted length of ``kw`` in characters.

    Returns:
        ``True`` if the keyword passes every check, ``False`` otherwise.
    """
    if len(kw) < min_length:
        return False

    kw_lower = kw.lower()

    # all() is True for an empty word list, so whitespace-only input is
    # rejected here as well.
    if all(word in ALL_STOPWORDS for word in kw_lower.split()):
        return False

    if _NUMERIC_ONLY_RE.match(kw):
        return False

    return not any(ui_re.search(kw_lower) for ui_re in _UI_TEXT_RES)
def calculate_keyword_density(keywords: List[Dict], total_words: int) -> List[Dict]:
    """Annotate each keyword dict with its density as a percentage.

    density = count * (words in the keyword) / total_words * 100, rounded to
    two decimals; it is 0 when ``total_words`` is not positive. The dicts are
    updated in place (a ``'density'`` key is set) and the same list object is
    returned.
    """
    for entry in keywords:
        words_in_term = len(entry['kw'].split())
        occurrences = entry['count']
        if total_words > 0:
            share = occurrences * words_in_term / total_words * 100
        else:
            share = 0
        entry['density'] = round(share, 2)
    return keywords
def classify_keywords(keywords: List[Dict]) -> Dict[str, List[Dict]]:
    """Split keywords into 'primary', 'secondary' and 'long_tail' tiers.

    Keywords are ranked by descending ``'count'``. The primary tier takes the
    first ``max(3, n // 5)`` entries, the secondary tier the following
    ``max(5, n * 3 // 10)`` entries, and the long tail whatever remains.
    An empty input yields three empty lists.
    """
    tiers = {'primary': [], 'secondary': [], 'long_tail': []}
    if keywords:
        ranked = list(keywords)
        ranked.sort(key=lambda entry: entry['count'], reverse=True)

        total = len(ranked)
        primary_end = max(3, total // 5)
        secondary_end = primary_end + max(5, total * 3 // 10)

        tiers['primary'] = ranked[:primary_end]
        tiers['secondary'] = ranked[primary_end:secondary_end]
        tiers['long_tail'] = ranked[secondary_end:]
    return tiers
# Ordered (topic, substrings) rules; the first rule with a matching substring
# wins. The Arabic terms are the counterparts of the English ones in each rule.
_TOPIC_RULES = (
    ('SEO & Search', ('seo', 'search', 'ranking', 'optimization', 'محرك', 'بحث')),
    ('E-commerce', ('shop', 'store', 'product', 'price', 'buy', 'متجر', 'منتج')),
    ('Content', ('blog', 'article', 'post', 'content', 'مقال', 'محتوى')),
    ('Location', ('city', 'location', 'address', 'مدينة', 'موقع', 'عنوان')),
)


def cluster_by_topic(keywords: List[Dict]) -> Dict[str, List[Dict]]:
    """Group keyword dicts under a topic label.

    Each keyword (lower-cased) is tested against ``_TOPIC_RULES`` in order
    using substring matching. Keywords matching no rule go to
    ``'Brand/Product'`` when their ``'count'`` is at least 5 and they have at
    most two words, otherwise to ``'General'``.

    Args:
        keywords: Keyword dicts with ``'kw'`` and ``'count'`` keys.

    Returns:
        A plain dict mapping topic name to the keyword dicts assigned to it,
        in order of first appearance. Topics with no keywords are absent.
    """
    clusters = defaultdict(list)

    for kw_obj in keywords:
        kw = kw_obj['kw'].lower()

        for topic, terms in _TOPIC_RULES:
            if any(term in kw for term in terms):
                break
        else:
            # No topical term matched: frequent, short keywords are treated
            # as brand/product names, everything else is general.
            if kw_obj['count'] >= 5 and len(kw.split()) <= 2:
                topic = 'Brand/Product'
            else:
                topic = 'General'

        clusters[topic].append(kw_obj)

    return dict(clusters)
def calculate_coverage_score(keywords: List[Dict], expected_keywords: List[str]) -> Dict:
    """Measure how many expected keywords appear among the found keywords.

    Comparison is exact-match on lower-cased strings; duplicates in either
    input collapse.

    Args:
        keywords: Keyword dicts with a ``'kw'`` key.
        expected_keywords: Keywords the page is expected to cover. ``None``
            is treated like an empty list.

    Returns:
        A dict with:
            ``score``: percentage of expected keywords found, rounded to one
                decimal (0 when nothing is expected);
            ``matched`` / ``missing``: alphabetically sorted lists, so the
                output is the same on every run;
            ``total_expected``: number of distinct expected keywords;
            ``total_found``: number of matched keywords.
    """
    found = {kw['kw'].lower() for kw in keywords}
    expected = {k.lower() for k in (expected_keywords or [])}

    matched = expected & found
    missing = expected - found

    coverage = (len(matched) / len(expected) * 100) if expected else 0

    return {
        'score': round(coverage, 1),
        # Sorted: plain list(set) order depends on string hashing and would
        # differ between interpreter runs.
        'matched': sorted(matched),
        'missing': sorted(missing),
        'total_expected': len(expected),
        'total_found': len(matched),
    }
def analyze_keywords(keywords: List[Dict], total_words: int = 0,
                     expected_keywords: List[str] = None) -> Dict:
    """
    Build the full keyword analytics report.

    Pipeline: clean each keyword, drop invalid ones, merge case-insensitive
    duplicates (summing their counts), rank by count, then derive density,
    classification, topic clusters, optional coverage and summary metrics.

    Args:
        keywords: List of keyword dicts with 'kw' and 'count'
        total_words: Total word count on page; density is only computed
            when this is greater than zero
        expected_keywords: Expected keywords for coverage analysis; coverage
            is None when this is empty or not given

    Returns:
        Dict with 'summary', 'top_keywords', 'classification', 'clusters',
        'coverage' and 'all_keywords'
    """
    # Clean, filter and de-duplicate in one pass. The first spelling seen for
    # a lower-cased keyword is kept; later duplicates only add to its count.
    merged = {}
    for raw in keywords:
        text = clean_keyword(raw['kw'])
        if not is_valid_keyword(text):
            continue
        key = text.lower()
        if key in merged:
            merged[key]['count'] += raw['count']
        else:
            merged[key] = {**raw, 'kw': text}

    # Highest count first
    unique = sorted(merged.values(), key=lambda entry: entry['count'], reverse=True)

    if total_words > 0:
        unique = calculate_keyword_density(unique, total_words)

    classification = classify_keywords(unique)
    clusters = cluster_by_topic(unique)

    if expected_keywords:
        coverage = calculate_coverage_score(unique, expected_keywords)
    else:
        coverage = None

    total_keywords = len(unique)
    if total_keywords > 0:
        avg_frequency = sum(entry['count'] for entry in unique) / total_keywords
    else:
        avg_frequency = 0

    summary = {
        'total_keywords': total_keywords,
        'total_words': total_words,
        'avg_frequency': round(avg_frequency, 1),
        'primary_keywords': len(classification['primary']),
        'secondary_keywords': len(classification['secondary']),
        'long_tail_keywords': len(classification['long_tail'])
    }

    return {
        'summary': summary,
        'top_keywords': unique[:10],
        'classification': classification,
        'clusters': clusters,
        'coverage': coverage,
        'all_keywords': unique
    }
def format_analytics_report(analytics: Dict) -> str:
    """Render an analytics dict as a plain-text report.

    Sections, in order: summary, top keywords table, classification (up to 5
    keywords per tier), topic clusters (up to 3 keywords per topic) and, when
    ``analytics['coverage']`` is truthy, topic coverage with up to 5 missing
    keywords.

    NOTE(review): the section icons were restored from mis-decoded UTF-8.
    The keycap digits, the bullet and the clusters icon are certain; the
    summary, top-keywords, classification, coverage and missing-keyword
    icons are best guesses - confirm against the original file.

    Args:
        analytics: Dict with 'summary', 'top_keywords', 'classification' and
            'clusters' keys, and optionally 'coverage'.

    Returns:
        The report as a single newline-joined string.
    """
    banner = "=" * 80
    rule = "-" * 80

    lines = [banner, "KEYWORD ANALYTICS REPORT", banner]

    # Summary
    summary = analytics['summary']
    lines.append("\n📊 SUMMARY")
    lines.append(rule)
    lines.append(f"Total Keywords Found: {summary['total_keywords']}")
    lines.append(f"Total Words on Page: {summary['total_words']}")
    lines.append(f"Average Keyword Frequency: {summary['avg_frequency']}")
    lines.append(f"Primary Keywords: {summary['primary_keywords']}")
    lines.append(f"Secondary Keywords: {summary['secondary_keywords']}")
    lines.append(f"Long-tail Keywords: {summary['long_tail_keywords']}")

    # Top Keywords
    lines.append("\n🔝 TOP KEYWORDS")
    lines.append(rule)
    lines.append(f"{'Keyword':<40} {'Frequency':<12} {'Density':<10}")
    lines.append(rule)
    for kw in analytics['top_keywords']:
        # Density is absent when the page word count was not supplied.
        density = f"{kw['density']:.2f}%" if 'density' in kw else 'N/A'
        lines.append(f"{kw['kw']:<40} {kw['count']:<12} {density:<10}")

    # Classification
    lines.append("\n📋 KEYWORD CLASSIFICATION")
    lines.append(rule)
    classification = analytics['classification']
    tiers = (
        ("\n1️⃣ PRIMARY KEYWORDS (High Priority)", 'primary'),
        ("\n2️⃣ SECONDARY KEYWORDS (Medium Priority)", 'secondary'),
        ("\n3️⃣ LONG-TAIL KEYWORDS (Low Priority)", 'long_tail'),
    )
    for heading, tier in tiers:
        lines.append(heading)
        for kw in classification[tier][:5]:
            lines.append(f" • {kw['kw']} ({kw['count']})")

    # Topic Clusters
    lines.append("\n🎯 TOPIC CLUSTERS")
    lines.append(rule)
    for topic, kws in analytics['clusters'].items():
        lines.append(f"\n{topic} ({len(kws)} keywords)")
        for kw in kws[:3]:
            lines.append(f" • {kw['kw']} ({kw['count']})")

    # Coverage
    coverage = analytics.get('coverage')
    if coverage:
        lines.append("\n📈 TOPIC COVERAGE")
        lines.append(rule)
        lines.append(f"Coverage Score: {coverage['score']}%")
        lines.append(f"Matched Keywords: {coverage['total_found']}/{coverage['total_expected']}")
        if coverage['missing']:
            lines.append("\nMissing Keywords:")
            for kw in coverage['missing'][:5]:
                lines.append(f" ❌ {kw}")

    lines.append("\n" + banner)
    return "\n".join(lines)
|