# File size: 13,737 Bytes | a74b879
import os
import json
import hashlib
import re
from typing import List
# Optional third-party clients. Each import is attempted on its own and the
# name is bound to None on failure, so this module still imports when a
# package is missing; callers check for None before using a client.
try:
    import openai
except Exception:
    openai = None
try:
    from groq import Groq
except Exception:
    Groq = None
# NOTE(review): `detect` is not referenced in the visible part of this file —
# confirm whether the langdetect dependency is still needed.
try:
    from langdetect import detect
except Exception:
    detect = None
# Set the library-wide API key from the environment when the package loaded.
if openai is not None:
    openai.api_key = os.getenv('OPENAI_API_KEY')
# Chat model passed to the OpenAI client; override with the OPENAI_MODEL env var.
DEFAULT_MODEL = os.getenv('OPENAI_MODEL', 'gpt-4o-mini')
# Bilingual recommendation texts, keyed first by language code ('ar' / 'en')
# and then by recommendation id. generate_recommendations() selects one
# language table and looks messages up by id.
# NOTE(review): 'thin_content' is not referenced in the visible part of this
# file — confirm whether another module uses it.
RECS_CONTENT = {
    # Arabic messages.
    'ar': {
        'headings': 'تحسين تسلسل العناوين: تأكد من وجود H1 واحد فقط واستخدام H2 ثم H3 بشكل منطقي لتسهيل الفهرسة.',
        'density': 'زيادة عمق المحتوى: استهدف 40-120 كلمة في الفقرات الرئيسية مع تقديم إجابات مباشرة وسهلة القراءة.',
        'entities': 'إضافة الكيانات المسماة: اذكر أسماء المنظمة، الأشخاص، والمنتجات بوضوح واربطها ببيانات Schema.',
        'faq': 'إنشاء صفحات الأسئلة والأجوبة (FAQ): أضف قسم للأسئلة الشائعة باستخدام JSON-LD لزيادة فرص الظهور في المحركات التوليدية.',
        'ai_visibility': 'تحسين الظهور في الذكاء الاصطناعي: أنشئ محتوى تعريفياً قصيراً (Definitional Content) يسهل على النماذج مثل ChatGPT وPerplexity اقتباسه.',
        'h1_missing': 'إضافة عنوان H1: أضف عنواناً رئيسياً واضحاً يحتوي على الكلمة المفتاحية واسم العلامة التجارية.',
        'short_paras': 'تطوير الفقرات: اجعل الفقرة الأولى تبدأ بإجابة مباشرة ومختصرة لزيادة احتمالية الاقتباس.',
        'thin_content': 'محتوى ضيق: أضف فقرات تعريفية وبيانات منظمة للمؤسسة (Organization Schema).',
    },
    # English messages (same ids as the Arabic table).
    'en': {
        'headings': 'Fix heading hierarchy: Ensure one H1 per page and incremental H2 → H3 structure for better indexing.',
        'density': 'Increase content depth: Aim for 40–120 words in core paragraphs with direct, readable answers.',
        'entities': 'Add named entities: Clearly mention Organizations, People, and Products and link them via Schema.',
        'faq': 'Create FAQ sections: Use FAQPage JSON-LD to increase chances of being featured in AI search results.',
        'ai_visibility': 'Optimize for AI: Create short, authoritative definitions of your services to encourage LLM citations.',
        'h1_missing': 'Add H1 heading: Ensure a clear H1 containing your primary keyword and brand name.',
        'short_paras': 'Expand paragraphs: Lead with a one-sentence "direct answer" to improve AI extraction likelihood.',
        'thin_content': 'Thin content: Add a definitional paragraph and an Organization JSON-LD block.',
    }
}
def _is_arabic(text: str) -> bool:
if not text: return False
return bool(re.search(r'[\u0600-\u06FF]', text))
def _build_prompt(pages: List[dict]):
lines = []
for p in pages:
title = p.get('title') or p.get('url')
first_para = (p.get('paragraphs') or [None])[0] or ''
lines.append(f"TITLE: {title}\nURL: {p.get('url')}\nTEXT: {first_para}\n---")
return "\n".join(lines)
def _cache_key_for_prompt(prompt: str, prefix: str = 'openai') -> str:
return f"{prefix}:{hashlib.sha256(prompt.encode('utf-8')).hexdigest()}"
def analyze_with_openai(pages: List[dict], api_key: str = None):
    """Ask an OpenAI chat model for a JSON summary of the crawled pages.

    Args:
        pages: Crawled page dicts; rendered into the prompt by _build_prompt.
        api_key: Explicit API key.  Falls back to the OPENAI_API_KEY
            environment variable when falsy.

    Returns:
        A dict that always carries ``enabled``:
        * ``{'enabled': False, 'reason': ...}`` when no key is available or
          the ``openai`` package could not be imported;
        * ``{'enabled': False, 'error': ...}`` when the request raised;
        * ``{'enabled': True, 'result': <parsed JSON>}`` on success;
        * ``{'enabled': True, 'raw': <text>}`` when the reply is not valid JSON.

    This function does not raise for request or parsing failures.
    """
    key = api_key or os.getenv('OPENAI_API_KEY')
    if not key:
        return {'enabled': False, 'reason': 'OPENAI_API_KEY not set'}
    # Mirror the Groq path: report a missing client explicitly instead of
    # surfacing an AttributeError on None as an opaque error string.
    if openai is None:
        return {'enabled': False, 'reason': 'openai client not installed'}
    try:
        prompt_content = _build_prompt(pages)
        system = (
            "You are an analytics assistant. Given crawled pages (title, url, text),"
            " produce a JSON object with keys: summary (one-paragraph), topics (array of top 6 topics),"
            " suggestions (array of action items). Return ONLY valid JSON."
        )
        messages = [
            {'role': 'system', 'content': system},
            {'role': 'user', 'content': prompt_content}
        ]
        client = openai.OpenAI(api_key=key)
        resp = client.chat.completions.create(
            model=DEFAULT_MODEL,
            messages=messages,
            temperature=0.2,
            max_tokens=800
        )
        text = resp.choices[0].message.content
        try:
            parsed = json.loads(text)
        except (TypeError, ValueError):
            # Not JSON (or no text at all): hand the raw reply back unchanged.
            # json.JSONDecodeError is a ValueError subclass.
            return {'enabled': True, 'raw': text}
        return {'enabled': True, 'result': parsed}
    except Exception as e:
        # Boundary handler: network/auth/SDK failures are reported, not raised.
        return {'enabled': False, 'error': str(e)}
def analyze_with_groq(pages: List[dict], api_key: str = None):
    """Ask a Groq-hosted chat model for a JSON analysis of the crawled pages.

    Args:
        pages: Crawled page dicts; rendered into the prompt by _build_prompt.
        api_key: Explicit API key.  Falls back to the GROQ_API_KEY
            environment variable when falsy.

    Returns:
        A dict that always carries ``enabled``:
        * ``{'enabled': False, 'reason': ...}`` when the groq client is not
          installed or no key is available;
        * ``{'enabled': False, 'error': ...}`` when the request raised;
        * ``{'enabled': True, 'result': <parsed JSON>}`` on success;
        * ``{'enabled': True, 'raw': <text>}`` when the reply is not valid JSON.

    The model name comes from the GROQ_MODEL environment variable and
    defaults to ``llama-3.1-8b-instant``.
    """
    if Groq is None:
        return {'enabled': False, 'reason': 'groq client not installed'}
    groq_key = api_key or os.getenv('GROQ_API_KEY')
    if not groq_key:
        return {'enabled': False, 'reason': 'GROQ_API_KEY not set'}
    try:
        client = Groq(api_key=groq_key)
        prompt = _build_prompt(pages)
        completion = client.chat.completions.create(
            model=os.getenv('GROQ_MODEL', 'llama-3.1-8b-instant'),
            messages=[
                {'role': 'system', 'content': 'You are an analytics assistant producing JSON output.'},
                {'role': 'user', 'content': prompt}
            ],
            temperature=0.2,
            max_completion_tokens=2048,
            stream=False
        )
        text = completion.choices[0].message.content
        try:
            parsed = json.loads(text)
        except (TypeError, ValueError):
            # Not JSON (or no text at all): hand the raw reply back unchanged.
            # json.JSONDecodeError is a ValueError subclass.
            return {'enabled': True, 'raw': text}
        return {'enabled': True, 'result': parsed}
    except Exception as e:
        # Boundary handler: network/auth/SDK failures are reported, not raised.
        return {'enabled': False, 'error': str(e)}
def analyze_pages(pages: List[dict], api_keys: dict = None):
    """Run both LLM analyses over *pages* and collect their results.

    ``api_keys`` may supply per-provider keys under ``'openai'`` and
    ``'groq'``; missing entries are passed through as ``None``.  The OpenAI
    analysis runs first, then Groq.  Returns
    ``{'openai': <result>, 'groq': <result>}``.
    """
    keys = api_keys if api_keys else {}
    openai_result = analyze_with_openai(pages, api_key=keys.get('openai'))
    groq_result = analyze_with_groq(pages, api_key=keys.get('groq'))
    return {'openai': openai_result, 'groq': groq_result}
def _mean_paragraph_words(paragraphs) -> float:
    """Return the mean whitespace-delimited word count of *paragraphs* (0.0 if empty)."""
    if not paragraphs:
        return 0.0
    return sum(len(str(x).split()) for x in paragraphs) / len(paragraphs)


def compute_geo_score(pages: List[dict], audit: dict = None, ai_visibility: dict = None):
    """Compute GEO visibility score (0-100) from pages.

    The score is the sum of five components worth up to 20 points each:

    * headings      - share of pages whose headings include an ``h1`` tag;
    * density       - per page ``min(1, mean paragraph words / 40)``, averaged;
    * entities      - 20 when any entity was counted, else 0;
    * faq           - ``h3`` headings on pages that have paragraphs, capped at
                      one per page on average;
    * ai_visibility - share of ``ai_visibility['results']`` entries flagged
                      ``mentioned`` (only when ``ai_visibility['enabled']``).

    Args:
        pages: Page dicts; ``headings`` (list of dicts with ``tag``) and
            ``paragraphs`` (list) are read.  Missing or ``None`` values are
            treated as empty.
        audit: Accepted for interface compatibility; not used here.
        ai_visibility: Optional dict with ``enabled`` and ``results``.

    Returns:
        Dict with ``score``, ``status`` ('Elite' >= 85, 'Authority' >= 70,
        'Needs Work' >= 40, otherwise 'Critical'), a rounded per-component
        ``breakdown``, per-page ``counts`` (critical / warnings / passed) and
        ``ai_radar_stats``.  ``total_queries`` is the real number of results,
        so it is 0 when none were supplied.
    """
    total_pages = max(1, len(pages))
    headings_ok_count = 0
    density_total = 0.0
    # NOTE(review): nothing in this function increments entity_counts, so the
    # entities component is always 0. The page schema for entities is not
    # visible here — confirm the intended source before wiring it up.
    entity_counts = 0
    faq_count = 0
    critical_issues = 0
    warnings = 0
    passed = 0

    for p in pages:
        # `or []` also covers keys that are present but set to None.
        headings = p.get('headings') or []
        paras = p.get('paragraphs') or []
        avg = _mean_paragraph_words(paras)
        tags = [h.get('tag', '') for h in headings]

        if 'h1' in tags:
            headings_ok_count += 1

        # Reaches 1.0 at a 40-word average and stays there for anything longer.
        density_total += min(1.0, avg / 40.0)

        if paras:
            faq_count += tags.count('h3')

        # Page-level triage used for the critical / warning / passed counters.
        if not headings or avg < 20:
            critical_issues += 1
        elif avg < 40:
            warnings += 1
        else:
            passed += 1

    headings_score = float(headings_ok_count / total_pages) * 20.0
    density_score = float(density_total / total_pages) * 20.0
    entity_score = 20.0 if entity_counts > 0 else 0.0
    faq_score = float(min(faq_count, total_pages) / total_pages) * 20.0

    ai_score = 0.0
    mentions = 0
    total_q = 0
    if ai_visibility and ai_visibility.get('enabled'):
        res = ai_visibility.get('results') or []
        mentions = sum(1 for r in res if r.get('mentioned'))
        total_q = len(res)
        if total_q:
            ai_score = (mentions / total_q) * 20

    raw_score = headings_score + density_score + entity_score + faq_score + ai_score
    score = int(round(min(raw_score, 100)))
    if score >= 85:
        status = 'Elite'
    elif score >= 70:
        status = 'Authority'
    elif score >= 40:
        status = 'Needs Work'
    else:
        status = 'Critical'

    return {
        'score': score,
        'status': status,
        'breakdown': {
            'headings': int(round(headings_score)),
            'density': int(round(density_score)),
            'entities': int(round(entity_score)),
            'faq': int(round(faq_score)),
            'ai_visibility': int(round(ai_score)),
        },
        'counts': {
            'critical': critical_issues,
            'warnings': warnings,
            'passed': passed
        },
        'ai_radar_stats': {
            'mentions': mentions,
            'total_queries': total_q,
            'visibility_percent': int((mentions / total_q) * 100) if total_q > 0 else 0
        }
    }
def infer_brand_name(pages: List[dict]) -> str:
    """Extract brand name from pages.

    The first five pages are inspected in order; for each page the first
    usable candidate wins:

    1. ``meta['og:site_name']`` or ``meta['application-name']``;
    2. the text of an ``h1`` heading shorter than 60 characters;
    3. the leading segment of the page title (shorter than 60 characters).

    Generic values such as "home" or "company" are skipped.  If no page
    yields a name, the host of the first page's URL is used with a leading
    ``www.`` and the final dot-suffix removed, separators turned into spaces
    and the result title-cased.  ``"Company"`` is the last-resort default.

    Title segmentation splits on ``|``, ``»``, en/em dashes, and on a hyphen
    only when it is surrounded by whitespace, so hyphenated names such as
    "Coca-Cola" are kept intact.
    """
    from urllib.parse import urlparse

    if not pages:
        return "Company"

    for page in pages[:5]:
        meta = page.get('meta') or {}
        # Strip before filtering so padded or whitespace-only values are
        # judged on their real content.
        og_site = str(meta.get('og:site_name') or meta.get('application-name') or '').strip()
        if og_site and og_site.lower() not in ('company', 'website', 'home'):
            return og_site

        for h in page.get('headings') or []:
            if h.get('tag') == 'h1':
                txt = str(h.get('text') or '').strip()
                if txt and len(txt) < 60 and txt.lower() not in ('home', 'welcome', 'company'):
                    return txt

        title_str = str(page.get('title') or '').strip()
        if title_str:
            title = re.split(r'\s+-\s+|[|–—»]', title_str)[0].strip()
            if title and len(title) < 60 and title.lower() not in ('home', 'welcome', 'company', 'homepage'):
                return title

    first_url = pages[0].get('url') or ''
    if first_url:
        try:
            parsed = urlparse(first_url)
            # .hostname drops any port/userinfo and lower-cases the host; for
            # scheme-less URLs the host sits at the front of .path instead.
            domain = (parsed.hostname or parsed.path.split('/', 1)[0]).lower()
            domain_clean = re.sub(r'^www\.', '', domain)
            domain_clean = re.sub(r'\.[a-z]{2,}$', '', domain_clean)
            domain_clean = domain_clean.replace('-', ' ').replace('_', ' ').strip()
            if domain_clean and len(domain_clean) > 1:
                return domain_clean.title()
        except (ValueError, TypeError, AttributeError):
            # Best effort only: a malformed or non-string URL falls through
            # to the default name below.
            pass
    return "Company"
def generate_recommendations(pages: List[dict], geo_score: dict = None, api_keys: dict = None, ai_analysis_results: dict = None, extra_context: dict = None):
    """Produce actionable recommendations based on pages and GEO score.

    Args:
        pages: Page dicts; ``title``, ``url``, ``headings`` and ``paragraphs``
            are read.  Missing or ``None`` values are treated as empty.
        geo_score: Optional result of compute_geo_score; its ``breakdown``
            drives the site-wide actions.
        api_keys, ai_analysis_results, extra_context: Accepted for interface
            compatibility; not used by this function.

    Returns:
        ``{'actions': [...], 'per_page': [...]}``.  ``actions`` holds
        site-wide messages for every breakdown component below its threshold.
        ``per_page`` has one entry per page with ``url``, ``title``,
        ``issues`` and ``suggestions``.  Messages are Arabic when Arabic
        script is found in the title or first paragraph of any of the first
        three pages, otherwise English.
    """
    recs = {
        'actions': [],
        'per_page': [],
    }

    # Detect language from the title and leading paragraph of the first pages.
    is_ar = False
    for p in pages[:3]:
        paras = p.get('paragraphs') or []
        lead = paras[0] if paras else ''
        # Format rather than concatenate so a None title or non-string
        # paragraph cannot raise TypeError.
        sample_text = f"{p.get('title') or ''} {lead or ''}"
        if _is_arabic(sample_text):
            is_ar = True
            break
    lang = 'ar' if is_ar else 'en'
    content = RECS_CONTENT[lang]

    # Heuristic actions based on geo_score: (component, minimum rounded score).
    if geo_score:
        b = geo_score.get('breakdown') or {}
        thresholds = (
            ('headings', 12),
            ('density', 12),
            ('entities', 10),
            ('faq', 10),
            ('ai_visibility', 10),
        )
        for component, minimum in thresholds:
            if b.get(component, 0) < minimum:
                recs['actions'].append(content[component])

    # Per-page recommendations
    for p in pages:
        page_rec = {
            'url': p.get('url'),
            'title': p.get('title'),
            'issues': [],
            'suggestions': []
        }
        tags = [h.get('tag', '') for h in p.get('headings') or []]
        if 'h1' not in tags:
            page_rec['issues'].append('Missing H1' if lang == 'en' else 'عنوان H1 مفقود')
            page_rec['suggestions'].append(content.get('h1_missing', 'Fix H1'))

        paras = p.get('paragraphs') or []
        avg = (sum(len(str(x).split()) for x in paras) / len(paras)) if paras else 0
        if avg < 30:
            page_rec['issues'].append('Short paragraphs' if lang == 'en' else 'فقرات قصيرة جداً')
            page_rec['suggestions'].append(content.get('short_paras', 'Expand paragraphs'))

        recs['per_page'].append(page_rec)
    return recs
def simulate_visibility(content: str, brand: str, api_keys: dict = None) -> dict:
    """Predicts AI Visibility for content.

    Placeholder implementation: the arguments are ignored and a fixed,
    freshly built result dict is returned on every call.
    """
    prediction = {}
    prediction['ai_visibility_score'] = 50
    prediction['sentiment'] = 'Neutral'
    prediction['entity_clarity'] = 'Medium'
    prediction['detailed_analysis'] = 'Content analysis available'
    prediction['suggested_fixes'] = []
    return prediction
def analyze_ai_visibility_deep(pages: List[dict], brand: str, api_keys: dict = None) -> dict:
    """Performs deep sentiment and visibility analysis.

    Placeholder implementation: the arguments are ignored and a fixed,
    freshly built report with three sections is returned on every call.
    """
    sentiment = {
        'sentiment_score': 0,
        'sentiment_label': 'Neutral',
        'recommendations': ['Improve content density to increase trust.'],
    }
    shopping = {
        'price_detected': False,
        'price_value': None,
        'rating_detected': False,
        'rating_value': 0,
    }
    context = {
        'scenario': 'General',
        'trigger': 'Unknown',
    }
    report = {}
    report['sentiment_analysis'] = sentiment
    report['shopping_visibility'] = shopping
    report['context_analysis'] = context
    return report
|