Spaces:

Alinabil1
/

last_edit

Sleeping

Moharek

Deploy Moharek GEO Platform

a74b879 28 days ago

13.7 kB

	import os
	import json
	import hashlib
	import re
	from typing import List

	try:
	import openai
	except Exception:
	openai = None

	try:
	from groq import Groq
	except Exception:
	Groq = None

	try:
	from langdetect import detect
	except Exception:
	detect = None

	# Set the module-level API key only when the optional OpenAI SDK was importable.
	if openai is not None:
	    openai.api_key = os.environ.get('OPENAI_API_KEY')

	# Chat model used for OpenAI requests; the OPENAI_MODEL environment variable overrides it.
	DEFAULT_MODEL = os.environ.get('OPENAI_MODEL', 'gpt-4o-mini')

	# Bilingual recommendation texts, looked up as RECS_CONTENT[language][recommendation_key].
	# Both languages expose the same set of keys.
	RECS_CONTENT = {
	    # Arabic texts.
	    'ar': {
	        'headings': 'تحسين تسلسل العناوين: تأكد من وجود H1 واحد فقط واستخدام H2 ثم H3 بشكل منطقي لتسهيل الفهرسة.',
	        'density': 'زيادة عمق المحتوى: استهدف 40-120 كلمة في الفقرات الرئيسية مع تقديم إجابات مباشرة وسهلة القراءة.',
	        'entities': 'إضافة الكيانات المسماة: اذكر أسماء المنظمة، الأشخاص، والمنتجات بوضوح واربطها ببيانات Schema.',
	        'faq': 'إنشاء صفحات الأسئلة والأجوبة (FAQ): أضف قسم للأسئلة الشائعة باستخدام JSON-LD لزيادة فرص الظهور في المحركات التوليدية.',
	        'ai_visibility': 'تحسين الظهور في الذكاء الاصطناعي: أنشئ محتوى تعريفياً قصيراً (Definitional Content) يسهل على النماذج مثل ChatGPT وPerplexity اقتباسه.',
	        'h1_missing': 'إضافة عنوان H1: أضف عنواناً رئيسياً واضحاً يحتوي على الكلمة المفتاحية واسم العلامة التجارية.',
	        'short_paras': 'تطوير الفقرات: اجعل الفقرة الأولى تبدأ بإجابة مباشرة ومختصرة لزيادة احتمالية الاقتباس.',
	        'thin_content': 'محتوى ضيق: أضف فقرات تعريفية وبيانات منظمة للمؤسسة (Organization Schema).',
	    },
	    # English texts.
	    'en': {
	        'headings': 'Fix heading hierarchy: Ensure one H1 per page and incremental H2 → H3 structure for better indexing.',
	        'density': 'Increase content depth: Aim for 40–120 words in core paragraphs with direct, readable answers.',
	        'entities': 'Add named entities: Clearly mention Organizations, People, and Products and link them via Schema.',
	        'faq': 'Create FAQ sections: Use FAQPage JSON-LD to increase chances of being featured in AI search results.',
	        'ai_visibility': 'Optimize for AI: Create short, authoritative definitions of your services to encourage LLM citations.',
	        'h1_missing': 'Add H1 heading: Ensure a clear H1 containing your primary keyword and brand name.',
	        'short_paras': 'Expand paragraphs: Lead with a one-sentence "direct answer" to improve AI extraction likelihood.',
	        'thin_content': 'Thin content: Add a definitional paragraph and an Organization JSON-LD block.',
	    },
	}

	def _is_arabic(text: str) -> bool:
	if not text: return False
	return bool(re.search(r'[\u0600-\u06FF]', text))

	def _build_prompt(pages: List[dict]):
	lines = []
	for p in pages:
	title = p.get('title') or p.get('url')
	first_para = (p.get('paragraphs') or [None])[0] or ''
	lines.append(f"TITLE: {title}\nURL: {p.get('url')}\nTEXT: {first_para}\n---")
	return "\n".join(lines)

	def _cache_key_for_prompt(prompt: str, prefix: str = 'openai') -> str:
	return f"{prefix}:{hashlib.sha256(prompt.encode('utf-8')).hexdigest()}"

	def analyze_with_openai(pages: List[dict], api_key: str = None):
	    """Summarise crawled pages with an OpenAI chat model.

	    Args:
	        pages: Crawled page dicts; ``_build_prompt`` turns them into the user
	            message.
	        api_key: Explicit API key. Falls back to the OPENAI_API_KEY
	            environment variable when omitted or empty.

	    Returns:
	        A dict that always contains ``enabled``:

	        * ``{'enabled': False, 'reason': ...}`` when no key is available or
	          the optional ``openai`` package could not be imported;
	        * ``{'enabled': True, 'result': <parsed JSON>}`` when the reply is
	          valid JSON;
	        * ``{'enabled': True, 'raw': <reply text>}`` when the reply is not
	          valid JSON;
	        * ``{'enabled': False, 'error': <message>}`` when building or sending
	          the request fails.
	    """
	    key = api_key or os.getenv('OPENAI_API_KEY')
	    if not key:
	        return {'enabled': False, 'reason': 'OPENAI_API_KEY not set'}

	    # Mirror analyze_with_groq: report a missing SDK explicitly instead of
	    # surfacing an AttributeError message from ``None.OpenAI``.
	    if openai is None:
	        return {'enabled': False, 'reason': 'openai client not installed'}

	    system = (
	        "You are an analytics assistant. Given crawled pages (title, url, text),"
	        " produce a JSON object with keys: summary (one-paragraph), topics (array of top 6 topics),"
	        " suggestions (array of action items). Return ONLY valid JSON."
	    )

	    try:
	        messages = [
	            {'role': 'system', 'content': system},
	            {'role': 'user', 'content': _build_prompt(pages)},
	        ]
	        client = openai.OpenAI(api_key=key)
	        resp = client.chat.completions.create(
	            model=DEFAULT_MODEL,
	            messages=messages,
	            temperature=0.2,
	            max_tokens=800,
	        )
	        text = resp.choices[0].message.content
	    except Exception as e:
	        # Best-effort boundary: any client/network failure is reported to the
	        # caller as data rather than raised.
	        return {'enabled': False, 'error': str(e)}

	    try:
	        parsed = json.loads(text)
	    except (TypeError, ValueError, RecursionError):
	        # Not valid JSON (or no text content at all): hand back the raw reply.
	        return {'enabled': True, 'raw': text}
	    return {'enabled': True, 'result': parsed}

	def analyze_with_groq(pages: List[dict], api_key: str = None):
	    """Summarise crawled pages with a Groq-hosted chat model.

	    Args:
	        pages: Crawled page dicts; ``_build_prompt`` turns them into the user
	            message.
	        api_key: Explicit API key. Falls back to the GROQ_API_KEY environment
	            variable when omitted or empty.

	    Returns:
	        A dict that always contains ``enabled``:

	        * ``{'enabled': False, 'reason': ...}`` when the optional ``groq``
	          package could not be imported or no key is available;
	        * ``{'enabled': True, 'result': <parsed JSON>}`` when the reply is
	          valid JSON;
	        * ``{'enabled': True, 'raw': <reply text>}`` when the reply is not
	          valid JSON;
	        * ``{'enabled': False, 'error': <message>}`` when building or sending
	          the request fails.

	    The model name is read from the GROQ_MODEL environment variable and
	    defaults to ``llama-3.1-8b-instant``.
	    """
	    if Groq is None:
	        return {'enabled': False, 'reason': 'groq client not installed'}

	    groq_key = api_key or os.getenv('GROQ_API_KEY')
	    if not groq_key:
	        return {'enabled': False, 'reason': 'GROQ_API_KEY not set'}

	    try:
	        client = Groq(api_key=groq_key)
	        prompt = _build_prompt(pages)
	        completion = client.chat.completions.create(
	            model=os.getenv('GROQ_MODEL', 'llama-3.1-8b-instant'),
	            messages=[
	                {'role': 'system', 'content': 'You are an analytics assistant producing JSON output.'},
	                {'role': 'user', 'content': prompt},
	            ],
	            temperature=0.2,
	            max_completion_tokens=2048,
	            stream=False,
	        )
	        text = completion.choices[0].message.content
	    except Exception as e:
	        # Best-effort boundary: any client/network failure is reported to the
	        # caller as data rather than raised.
	        return {'enabled': False, 'error': str(e)}

	    try:
	        parsed = json.loads(text)
	    except (TypeError, ValueError, RecursionError):
	        # Not valid JSON (or no text content at all): hand back the raw reply.
	        return {'enabled': True, 'raw': text}
	    return {'enabled': True, 'result': parsed}

	def analyze_pages(pages: List[dict], api_keys: dict = None):
	    """Run both LLM analysers over *pages* and collect their results by provider.

	    ``api_keys`` may carry per-provider keys under ``'openai'`` and ``'groq'``;
	    a missing entry is passed on as None. The returned dict has the keys
	    ``'openai'`` and ``'groq'``, each holding that analyser's result dict.
	    """
	    provider_keys = api_keys if api_keys else {}
	    openai_result = analyze_with_openai(pages, api_key=provider_keys.get('openai'))
	    groq_result = analyze_with_groq(pages, api_key=provider_keys.get('groq'))
	    return {'openai': openai_result, 'groq': groq_result}

	def compute_geo_score(pages: List[dict], audit: dict = None, ai_visibility: dict = None):
	    """Compute a GEO visibility score (0-100) from crawled pages.

	    The score is the sum of five components worth up to 20 points each:

	    * headings: share of pages that have an ``h1`` heading;
	    * density: per page, the average words per paragraph divided by 40 and
	      capped at 1, averaged over all pages;
	    * entities: always 0 here, because this function performs no entity
	      extraction (NOTE(review): confirm whether an entity count was meant to
	      be supplied by the caller);
	    * faq: number of ``h3`` headings on pages that have paragraphs, capped
	      at the number of pages, as a share of pages;
	    * ai_visibility: share of ``ai_visibility['results']`` entries flagged
	      ``mentioned`` (only when ``ai_visibility['enabled']`` is truthy).

	    Heading tags are compared case-insensitively.

	    Args:
	        pages: Crawled page dicts with optional ``headings`` (list of dicts
	            with a ``tag``) and ``paragraphs`` (list of texts). None is
	            treated as no pages.
	        audit: Accepted for interface compatibility; not read.
	        ai_visibility: Optional dict with ``enabled`` and ``results``.

	    Returns:
	        Dict with ``score``, ``status`` (Elite / Authority / Needs Work /
	        Critical), per-component ``breakdown``, page ``counts`` (critical /
	        warnings / passed) and ``ai_radar_stats``. ``total_queries`` is the
	        real number of results, so it is 0 when there are none.
	    """
	    pages = pages or []
	    total_pages = max(1, len(pages))  # avoids division by zero for empty input

	    headings_ok_count = 0
	    density_total = 0.0
	    entity_counts = 0  # never incremented: no entity extraction happens here
	    faq_count = 0
	    critical_issues = 0
	    warnings = 0
	    passed = 0

	    for p in pages:
	        headings = p.get('headings') or []
	        tags = [str(h.get('tag', '')).lower() for h in headings]
	        paras = p.get('paragraphs') or []
	        avg = (sum(len(str(x).split()) for x in paras) / len(paras)) if paras else 0.0

	        if 'h1' in tags:
	            headings_ok_count += 1

	        # Full credit from an average of 40 words per paragraph upwards.
	        density_total += min(1.0, avg / 40.0)

	        if paras:
	            faq_count += tags.count('h3')

	        # Page-level triage used for the issue counters.
	        if not headings or avg < 20:
	            critical_issues += 1
	        elif avg < 40:
	            warnings += 1
	        else:
	            passed += 1

	    headings_score = float(headings_ok_count / total_pages) * 20.0
	    density_score = float(density_total / total_pages) * 20.0
	    entity_score = 20.0 if entity_counts > 0 else 0.0
	    faq_score = float(min(faq_count, total_pages) / total_pages) * 20.0

	    ai_score = 0.0
	    mentions = 0
	    total_q = 0
	    if ai_visibility and ai_visibility.get('enabled'):
	        results = ai_visibility.get('results') or []
	        mentions = sum(1 for r in results if r.get('mentioned'))
	        total_q = len(results)
	        if total_q:
	            ai_score = (mentions / total_q) * 20

	    raw_score = headings_score + density_score + entity_score + faq_score + ai_score
	    score = int(round(min(raw_score, 100)))
	    if score >= 85:
	        status = 'Elite'
	    elif score >= 70:
	        status = 'Authority'
	    elif score >= 40:
	        status = 'Needs Work'
	    else:
	        status = 'Critical'

	    return {
	        'score': score,
	        'status': status,
	        'breakdown': {
	            'headings': int(round(headings_score)),
	            'density': int(round(density_score)),
	            'entities': int(round(entity_score)),
	            'faq': int(round(faq_score)),
	            'ai_visibility': int(round(ai_score)),
	        },
	        'counts': {
	            'critical': critical_issues,
	            'warnings': warnings,
	            'passed': passed,
	        },
	        'ai_radar_stats': {
	            'mentions': mentions,
	            'total_queries': total_q,
	            'visibility_percent': int((mentions / total_q) * 100) if total_q > 0 else 0,
	        },
	    }

	def infer_brand_name(pages: List[dict]) -> str:
	    """Guess the brand name of a crawled site.

	    The first five pages are inspected in order; for each page the first
	    usable candidate wins:

	    1. ``meta['og:site_name']`` or ``meta['application-name']``;
	    2. the text of an ``h1`` heading shorter than 60 characters;
	    3. the leading segment of the page title (shorter than 60 characters).
	       The title is split on ``|``, ``\\``, ``»``, en/em dashes, and on a
	       hyphen only when it has whitespace on one side, so hyphenated names
	       such as "Coca-Cola" stay intact.

	    Candidates are stripped before use and generic words (home, welcome,
	    company, homepage, website) are rejected for every source.

	    If no page yields a name, the host of the first page URL is used: a
	    leading ``www.`` and the last dot-suffix are removed, ``-``/``_`` become
	    spaces and the result is title-cased. Only the final suffix is removed,
	    so ``example.co.uk`` becomes ``Example.Co``.

	    Returns:
	        The inferred name, or ``"Company"`` when nothing usable is found.
	    """
	    if not pages:
	        return "Company"

	    generic = {'home', 'welcome', 'company', 'homepage', 'website'}

	    def _clean(value) -> str:
	        # Normalise a possibly-missing / non-string value to stripped text.
	        return str(value).strip() if value else ''

	    for page in pages[:5]:
	        meta = page.get('meta') or {}
	        site = _clean(meta.get('og:site_name') or meta.get('application-name'))
	        if site and site.lower() not in generic:
	            return site

	        for h in page.get('headings') or []:
	            if str(h.get('tag', '')).lower() != 'h1':
	                continue
	            txt = _clean(h.get('text'))
	            if txt and len(txt) < 60 and txt.lower() not in generic:
	                return txt

	        title_str = _clean(page.get('title'))
	        if title_str:
	            lead = re.split(r'[\\|—–»]|\s-|-\s', title_str)[0].strip()
	            if lead and len(lead) < 60 and lead.lower() not in generic:
	                return lead

	    first_url = _clean(pages[0].get('url'))
	    if first_url:
	        from urllib.parse import urlparse
	        try:
	            # Prefix scheme-less URLs with "//" so the host is parsed as the
	            # network location instead of ending up in the path.
	            target = first_url if '//' in first_url else '//' + first_url
	            host = urlparse(target).hostname or ''  # lower-cased, port removed
	        except ValueError:
	            host = ''
	        domain_clean = re.sub(r'^www\.', '', host)
	        domain_clean = re.sub(r'\.[a-z]{2,}$', '', domain_clean)
	        domain_clean = domain_clean.replace('-', ' ').replace('_', ' ').strip()
	        if len(domain_clean) > 1:
	            return domain_clean.title()

	    return "Company"

	def generate_recommendations(pages: List[dict], geo_score: dict = None, api_keys: dict = None, ai_analysis_results: dict = None, extra_context: dict = None):
	    """Produce actionable recommendations based on pages and GEO score.

	    The output language is Arabic when the title or first paragraph of any
	    of the first three pages contains Arabic script, otherwise English.

	    Args:
	        pages: Crawled page dicts (``url``, ``title``, ``headings``,
	            ``paragraphs``). None is treated as no pages; missing or None
	            fields are tolerated.
	        geo_score: Optional score dict; its ``breakdown`` values drive the
	            site-wide actions (headings/density below 12, entities/faq/
	            ai_visibility below 10).
	        api_keys: Accepted for interface compatibility; not read.
	        ai_analysis_results: Accepted for interface compatibility; not read.
	        extra_context: Accepted for interface compatibility; not read.

	    Returns:
	        ``{'actions': [...], 'per_page': [...]}`` where each per-page entry
	        holds ``url``, ``title``, ``issues`` and ``suggestions``. A page is
	        flagged for a missing H1 (tags compared case-insensitively) and for
	        an average paragraph length under 30 words.
	    """
	    pages = pages or []
	    api_keys = api_keys or {}
	    extra_context = extra_context or {}
	    recs = {
	        'actions': [],
	        'per_page': [],
	    }

	    # Detect language from a small sample of the first pages.
	    is_ar = False
	    for p in pages[:3]:
	        paras = p.get('paragraphs') or []
	        title = p.get('title') or ''
	        lead = (paras[0] if paras else '') or ''
	        # Formatting (rather than concatenating) tolerates a None or
	        # non-string title / paragraph.
	        if _is_arabic(f"{title} {lead}"):
	            is_ar = True
	            break

	    lang = 'ar' if is_ar else 'en'
	    content = RECS_CONTENT[lang]

	    # Site-wide actions: recommend a fix for every weak score component.
	    if geo_score:
	        b = geo_score.get('breakdown') or {}
	        thresholds = (
	            ('headings', 12),
	            ('density', 12),
	            ('entities', 10),
	            ('faq', 10),
	            ('ai_visibility', 10),
	        )
	        for key, minimum in thresholds:
	            if b.get(key, 0) < minimum:
	                recs['actions'].append(content[key])

	    # Per-page recommendations.
	    for p in pages:
	        page_rec = {
	            'url': p.get('url'),
	            'title': p.get('title'),
	            'issues': [],
	            'suggestions': [],
	        }

	        tags = [str(h.get('tag', '')).lower() for h in (p.get('headings') or [])]
	        if 'h1' not in tags:
	            page_rec['issues'].append('Missing H1' if lang == 'en' else 'عنوان H1 مفقود')
	            page_rec['suggestions'].append(content.get('h1_missing', 'Fix H1'))

	        paras = p.get('paragraphs') or []
	        avg = (sum(len(str(x).split()) for x in paras) / len(paras)) if paras else 0
	        if avg < 30:
	            page_rec['issues'].append('Short paragraphs' if lang == 'en' else 'فقرات قصيرة جداً')
	            page_rec['suggestions'].append(content.get('short_paras', 'Expand paragraphs'))

	        recs['per_page'].append(page_rec)

	    return recs

	def simulate_visibility(content: str, brand: str, api_keys: dict = None) -> dict:
	    """Return a placeholder AI-visibility prediction.

	    None of the arguments are read: the same fixed values are returned on
	    every call (a fresh dict each time).
	    """
	    prediction = dict(
	        ai_visibility_score=50,
	        sentiment='Neutral',
	        entity_clarity='Medium',
	        detailed_analysis='Content analysis available',
	        suggested_fixes=[],
	    )
	    return prediction

	def analyze_ai_visibility_deep(pages: List[dict], brand: str, api_keys: dict = None) -> dict:
	    """Return a placeholder deep sentiment / visibility analysis.

	    None of the arguments are read: the same fixed structure, with the
	    sections ``sentiment_analysis``, ``shopping_visibility`` and
	    ``context_analysis``, is returned on every call.
	    """
	    sentiment = {
	        'sentiment_score': 0,
	        'sentiment_label': 'Neutral',
	        'recommendations': ['Improve content density to increase trust.'],
	    }
	    shopping = {
	        'price_detected': False,
	        'price_value': None,
	        'rating_detected': False,
	        'rating_value': 0,
	    }
	    context = {
	        'scenario': 'General',
	        'trigger': 'Unknown',
	    }
	    return {
	        'sentiment_analysis': sentiment,
	        'shopping_visibility': shopping,
	        'context_analysis': context,
	    }