"""AI Content Engine — محرك المحتوى بالذكاء الاصطناعي
Supports: Groq (fast/free), OpenAI, Claude, and Ollama, with a clearly-labeled demo fallback.
2026 Standards: Grounded content, Entity graphs, Multi-schema, GEO local, Proof sections.
"""
import os
import json
import re
import requests
import sys
import random
# ── LLM backends ──────────────────────────────────────────────────────────────
def _call_groq(prompt: str, api_key: str = None) -> str:
keys = [api_key.strip()] if api_key and api_key.strip() else []
for suffix in ['', '_2', '_3', '_4', '_5']:
k = os.getenv(f'GROQ_API_KEY{suffix}')
if k and k.strip() and k.strip() not in keys:
keys.append(k.strip())
if not keys:
raise RuntimeError('No GROQ_API_KEY found in .env or passed')
last_err = None
for key in keys:
try:
resp = requests.post(
"https://api.groq.com/openai/v1/chat/completions",
headers={"Authorization": f"Bearer {key}", "Content-Type": "application/json"},
json={
"model": os.getenv('GROQ_MODEL', 'llama-3.3-70b-versatile'),
"messages": [{"role": "user", "content": prompt}],
"temperature": 0.2,
"max_tokens": 3000
},
timeout=60
)
resp.raise_for_status()
return resp.json()["choices"][0]["message"]["content"]
except Exception as e:
last_err = e
continue
raise RuntimeError(f"All GROQ keys failed. Last error: {last_err}")
def _call_openai(prompt: str, api_key: str = None) -> str:
keys = [api_key.strip()] if api_key and api_key.strip() else []
for suffix in ['', '_2', '_3', '_4', '_5']:
k = os.getenv(f'OPENAI_API_KEY{suffix}')
if k and k.strip() and k.strip() not in keys:
keys.append(k.strip())
if not keys:
raise RuntimeError('No OPENAI_API_KEY found in .env or passed')
last_err = None
for key in keys:
try:
resp = requests.post(
"https://api.openai.com/v1/chat/completions",
headers={"Authorization": f"Bearer {key}", "Content-Type": "application/json"},
json={
"model": os.getenv('OPENAI_MODEL', 'gpt-4o-mini'),
"messages": [{"role": "user", "content": prompt}],
"temperature": 0.2,
"max_tokens": 3000
},
timeout=60
)
resp.raise_for_status()
return resp.json()["choices"][0]["message"]["content"]
except Exception as e:
last_err = e
continue
raise RuntimeError(f"All OPENAI keys failed. Last error: {last_err}")
def _call_claude(prompt: str, api_key: str = None) -> str:
key = api_key or os.getenv('CLAUDE_API_KEY') or os.getenv('ANTHROPIC_API_KEY')
if not key:
raise RuntimeError('CLAUDE_API_KEY not set')
resp = requests.post(
'https://api.anthropic.com/v1/messages',
headers={'x-api-key': key, 'anthropic-version': '2023-06-01', 'content-type': 'application/json'},
json={'model': os.getenv('CLAUDE_MODEL', 'claude-3-5-sonnet-20241022'),
'max_tokens': 3000, 'messages': [{'role': 'user', 'content': prompt}]},
timeout=60
)
resp.raise_for_status()
return resp.json()['content'][0]['text']
def _call_ollama(prompt: str, model: str = None) -> str:
host = os.getenv('OLLAMA_HOST', 'http://localhost:11434')
model = model or os.getenv('OLLAMA_MODEL', 'llama3')
resp = requests.post(f"{host}/api/chat", json={
'model': model,
'messages': [{'role': 'user', 'content': prompt}],
'stream': False
}, timeout=120)
resp.raise_for_status()
return resp.json()['message']['content']
def _call_demo(prompt: str) -> str:
"""Honest demo — shows the STRUCTURE of what real output looks like, clearly marked as demo."""
prompt_lower = prompt.lower()
is_arabic = 'arabic' in prompt_lower or bool(re.search(r'[\u0600-\u06FF]', prompt))
# Extract keyword
kw_match = re.search(r'(?:TARGET KEYWORD|keyword)[:\s]+([^\n]+)', prompt, re.IGNORECASE)
keyword = kw_match.group(1).strip() if kw_match else 'your keyword'
# Extract brand — try multiple patterns
brand = 'YourBrand'
for pattern in [
r'Brand[:\s]+([^\n,]+)',
r'BRAND[:\s]+([^\n,]+)',
r'BRAND/SITE[:\s]+([^\n,]+)',
r'target site[:\s]+([^\n,]+)',
]:
m = re.search(pattern, prompt, re.IGNORECASE)
if m:
candidate = m.group(1).strip()
# Clean URL from brand
if ',' in candidate:
parts = [p.strip() for p in candidate.split(',')]
candidate = next((p for p in parts if not p.startswith('http')), parts[-1])
if candidate.startswith('http'):
candidate = candidate.split('//')[-1].split('/')[0].replace('www.', '')
if candidate and candidate != 'YourBrand':
brand = candidate
break
# Extract real page content if passed
content_match = re.search(r'Page Content[:\s]+(.*?)(?:\n\n|\Z)', prompt, re.DOTALL | re.IGNORECASE)
real_snippet = content_match.group(1).strip()[:300] if content_match else ''
if 'faq' in prompt_lower:
if is_arabic:
return json.dumps({"faqs": [
{"question": f"ما هي خدمات {brand}؟",
"answer": f"[DEMO] {brand} تقدم خدمات متخصصة في مجال {keyword}. لتوليد إجابات حقيقية مبنية على بيانات موقعك، أضف مفتاح Groq API في الإعدادات."},
{"question": f"كيف يساعد {brand} في تحسين الظهور؟",
"answer": f"[DEMO] من خلال استراتيجيات {keyword} المدعومة بالذكاء الاصطناعي. أضف Groq API للحصول على إجابات مبنية على بيانات زحف موقعك الفعلية."}
]}, ensure_ascii=False)
return json.dumps({"faqs": [
{"question": f"What does {brand} offer for {keyword}?",
"answer": f"[DEMO] {brand} provides specialized {keyword} services. Add a Groq API key in Settings to generate answers grounded in your actual crawled data."},
{"question": f"How does {brand} improve {keyword} rankings?",
"answer": "[DEMO] Through AI-powered strategies. Connect your API key to generate evidence-based answers from your site's real content."}
]})
if 'optimize' in prompt_lower or 'analyze' in prompt_lower or 'audit' in prompt_lower:
if is_arabic:
return json.dumps({
"score": 42,
"score_breakdown": {"direct_answer": 8, "entities": 12, "intent": 5, "proof": 7, "schema": 10},
"issues": [
"فشل في الاتصال بمحركات الذكاء الاصطناعي (أضف مفتاح API)",
"غياب الإجابة المباشرة (Direct Answer) القابلة للاقتباس",
"ضعف في كثافة الكيانات المرتبطة بالعلامة التجارية"
],
"suggestions": [
"أضف مفتاح Groq API في 'إعدادات النظام' لتفعيل التحليل الحقيقي",
"اربط بيانات الزحف من 'سجل الأبحاث' لضبط سياق الـ GEO",
"استخدم ميزة 'الهوية الذكية' أولاً لبناء أساس المعرفة"
],
"implemented_fixes": [
"تفعيل وضع المعاينة (Demo Mode) لهيكلة البيانات",
"تهيئة واجهة v2.0-ULTRA لاستقبال البيانات الحقيقية",
"فحص توافق مفاتيح API (لم يتم العثور على مفتاح)"
],
"optimized_content": f"# [وضع العرض] تحسين {keyword} لموقع {brand}\n\nهذا مجرد نموذج عرض لشكل النتائج. للحصول على محتوى محسّن حقيقي مبني على بيانات موقعك، يرجى إضافة مفتاح API صحيح في الإعدادات.",
"schema": "",
"backend": "demo"
}, ensure_ascii=False)
return json.dumps({
"score": 42,
"score_breakdown": {"direct_answer": 8, "entities": 12, "intent": 5, "proof": 7, "schema": 10},
"issues": ["⚠️ No API key connected (Demo Mode)", "Direct Answer missing or too generic", "Entity density below required threshold"],
"suggestions": ["Add a Groq API key in System Settings", "Connect Research data to enable grounding", "Run 'Smart Identity' module first"],
"implemented_fixes": [
"Initialized v2.0-ULTRA Premium Interface",
f"Mapped semantic requirements for {keyword}",
f"Validated site context for {brand}"
],
"optimized_content": f"# [DEMO] {keyword} Optimization for {brand}\n\nAdd your Groq or OpenAI API key in Settings to generate a real, grounded version of this content.",
"schema": "",
"backend": "demo"
})
# Article generation demo
if is_arabic:
return json.dumps({
"title": f"[DEMO] {brand}: دليل {keyword} — أضف Groq API للمحتوى الحقيقي",
"meta_description": f"[DEMO] وصف تعريفي لـ {brand} في مجال {keyword}. أضف مفتاح API للحصول على وصف حقيقي.",
"content": f"""# [وضع تجريبي — أضف Groq API للمحتوى الحقيقي]
## ما الذي ستحصل عليه بعد إضافة مفتاح API؟
### 1. الإجابة المباشرة (Direct Answer)
محتوى محدد وقابل للاقتباس من محركات الذكاء الاصطناعي، مبني على بيانات موقع {brand} الفعلية.
{f'بيانات من موقعك: {real_snippet}' if real_snippet else ''}
### 2. خريطة الكيانات (Entity Graph)
```
{brand} → (Organization/LocalBusiness)
├── provides → [{keyword}]
├── operates_in → [المدينة/الدولة]
├── competes_with → [المنافسون]
└── recognized_by → [Google, Bing, Perplexity]
```
### 3. طبقة GEO المحلية
- كلمات مفتاحية محلية: "{keyword} في الرياض"، "{keyword} السعودية"
- Google Maps integration
- LocalBusiness Schema
### 4. قسم الإثبات (Proof Section)
- أرقام وإحصائيات حقيقية من موقعك
- Case studies
- نتائج قابلة للقياس
### 5. Schema متكامل
Organization + LocalBusiness + Service + FAQ
---
لتفعيل المحتوى الحقيقي: أضف مفتاح Groq API في الإعدادات (مجاني على groq.com)""",
"faqs": [
{"question": f"[DEMO] ما هي خدمات {brand}؟",
"answer": "أضف Groq API للحصول على إجابات مبنية على بيانات موقعك الفعلية."}
],
"schema": "",
"implemented_fixes": ["DEMO MODE — add API key for real implementation"],
"backend": "demo"
}, ensure_ascii=False)
return json.dumps({
"title": f"[DEMO] {brand}: {keyword} Guide — Add Groq API for Real Content",
"meta_description": f"[DEMO] Add a Groq API key to generate content grounded in {brand}'s actual crawled data.",
"content": f"""# [DEMO MODE — Add Groq API for Real Content]
## What you'll get with a real API key:
### 1. Direct Answer (AI-Citable)
A specific, evidence-based 50-word answer about {brand} and {keyword}, built from your crawled pages.
{f'Your site data: {real_snippet}' if real_snippet else ''}
### 2. Entity Graph
```
{brand} → (Organization)
├── provides → [{keyword}]
├── operates_in → [City/Country]
├── competes_with → [Real competitors]
└── cited_by → [ChatGPT, Perplexity, Google SGE]
```
### 3. GEO Local Layer
- Local keywords: "{keyword} in [City]", "best {keyword} [Country]"
- Google Maps signals
- LocalBusiness Schema
### 4. Proof Section
- Real metrics from your site
- Case studies
- Measurable outcomes
### 5. Full Schema Stack
Organization + LocalBusiness + Service + FAQ
---
To activate: Add free Groq API key in Settings (groq.com)""",
"faqs": [
{"question": f"[DEMO] What does {brand} offer for {keyword}?",
"answer": "Add Groq API key to generate answers grounded in your actual site data."}
],
"schema": "",
"implemented_fixes": ["DEMO MODE — add API key for real implementation"],
"backend": "demo"
})
def _llm_call(prompt: str, prefer: str = 'groq', api_keys: dict = None) -> dict:
"""Try backends in order. Returns {text, backend, errors}.
Automatically detects quota issues and falls back to Demo mode."""
api_keys = {k: (v.strip() if v else None) for k, v in (api_keys or {}).items()}
# Always include 'demo' as ultimate fallback
order = [prefer] + [b for b in ['groq', 'openai', 'claude', 'ollama', 'demo'] if b != prefer]
errors = {}
for backend in order:
try:
text = None
if backend == 'groq':
text = _call_groq(prompt, api_keys.get('groq'))
elif backend == 'openai':
text = _call_openai(prompt, api_keys.get('openai'))
elif backend == 'claude':
text = _call_claude(prompt, api_keys.get('claude'))
elif backend == 'ollama':
text = _call_ollama(prompt)
elif backend == 'demo':
text = _call_demo(prompt)
try:
res = json.loads(text)
if isinstance(res, dict):
res['backend_errors'] = errors
text = json.dumps(res, ensure_ascii=False)
                except Exception: pass
return {'text': text, 'backend': 'demo', 'errors': errors}
if text:
# Basic check: if LLM returns a common quota error as text instead of throwing
low_text = text.lower()
if any(x in low_text for x in ['insufficient_quota', 'rate_limit', 'too many requests', 'error 429']):
raise RuntimeError(f"Quota error detected in response: {text[:50]}")
                return {'text': text, 'backend': backend, 'errors': errors}
except Exception as e:
errors[backend] = str(e)
print(f"[ContentEngine] Backend {backend} failed: {e}")
continue
return {'text': _call_demo(prompt), 'backend': 'demo', 'errors': errors}
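# Illustrative probe of the fallback chain (a sketch, not used by the engine):
# with no keys in the environment and no local Ollama, every live backend
# fails and the call lands on the structured demo response.
def _example_fallback_probe() -> dict:
    """Run _llm_call with empty credentials and report which backend answered."""
    result = _llm_call('TARGET KEYWORD: solar panels\nGenerate 2 FAQ pairs.', prefer='groq', api_keys={})
    print(f"[example] answered by: {result['backend']}; errors: {list(result.get('errors', {}))}")
    return result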
def _parse_json_from_text(text: str) -> dict:
"""Extract first JSON object from LLM response with robust fallback and repair.
Now handles truncated strings and malformed escapes."""
if not text or not text.strip():
raise ValueError('Empty LLM response')
text = text.strip()
# Strip common LLM prefixes
if "```json" in text.lower():
parts = re.split(r'```json', text, flags=re.IGNORECASE)
if len(parts) > 1: text = parts[1].split("```")[0]
elif "```" in text:
parts = text.split("```")
if len(parts) > 1: text = parts[1]
start = text.find('{')
end = text.rfind('}')
if start != -1 and end != -1 and end >= start:
block = text[start:end+1]
else:
block = text.strip()
# Repair level 1: Direct parse
try:
return json.loads(block, strict=False)
except Exception: pass
# Repair level 2: Common fixes (smart quotes, trailing commas)
try:
repaired = block.replace('“', '"').replace('”', '"')
repaired = re.sub(r',\s*([\]}])', r'\1', repaired)
# Fix unescaped newlines in content strings
repaired = re.sub(r'":\s*"(.*?)"', lambda m: '": "' + m.group(1).replace('\n', '\\n') + '"', repaired, flags=re.DOTALL)
return json.loads(repaired, strict=False)
except Exception: pass
    # Repair level 3: Truncated JSON recovery (close open brackets/braces)
    try:
        if block.count('{') > block.count('}') or block.count('[') > block.count(']'):
            temp = block.strip()
            if temp.count('"') % 2 == 1:
                temp = re.sub(r'"[^"]*$', '', temp)  # strip an unterminated trailing string
            temp = re.sub(r'[,:]\s*$', '', temp)     # then strip a dangling comma/colon
            while temp.count('[') > temp.count(']'):
                temp += ']'
            while temp.count('{') > temp.count('}'):
                temp += '}'
            return json.loads(temp, strict=False)
    except Exception: pass
# Log failure for diagnosis
print(f"[ContentEngine] JSON Parsing Failed. Block snippet:\n{text[:300]}...", file=sys.stderr)
raise ValueError('No JSON found in LLM response')
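# Worked example (sketch) of the repair ladder, with an illustrative truncated
# payload:
#   raw = '```json\n{"title": "Guide", "semantic_score": 88, "lsi_keyw'
# Levels 1 and 2 fail; level 3 strips the unterminated '"lsi_keyw', drops the
# dangling comma, closes the brace, and parses
#   {"title": "Guide", "semantic_score": 88}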
def _build_context_block(crawl_data: dict) -> str:
"""Build a grounded context block from real crawled data."""
if not crawl_data:
return ''
lines = ['=== GROUNDED DATA FROM CRAWLED SITE ===']
if crawl_data.get('org_name'):
lines.append(f'Brand Name: {crawl_data["org_name"]}')
if crawl_data.get('url'):
lines.append(f'Target Site URL: {crawl_data["url"]}')
if crawl_data.get('industry'):
lines.append(f'Industry: {crawl_data["industry"]}')
if crawl_data.get('keywords'):
        kws = [str(k.get('kw', k)) if isinstance(k, dict) else str(k) for k in crawl_data['keywords'][:20]]
lines.append(f'Top Keywords: {", ".join(kws)}')
if crawl_data.get('headings'):
lines.append(f'Site Structure (H1-H3): {" | ".join(crawl_data["headings"][:15])}')
if crawl_data.get('page_content'):
# Pass more content for better grounding
lines.append(f'Key Page Content Sample:\n{crawl_data["page_content"][:2000]}')
if crawl_data.get('competitors'):
lines.append(f'Recognized Competitors: {", ".join(crawl_data["competitors"][:8])}')
if crawl_data.get('geo_score'):
gs = crawl_data['geo_score']
lines.append(f'Current GEO Score: {gs.get("score", 0)}% — Status: {gs.get("status", "Analyzed")}')
if crawl_data.get('issues'):
lines.append(f'Detected SEO/GEO Gaps: {"; ".join(crawl_data["issues"][:10])}')
if crawl_data.get('local_regions'):
lines.append(f'Served Regions: {", ".join(crawl_data["local_regions"])}')
if crawl_data.get('entities'):
entities = [f"{e.get('text')} ({e.get('type')})" for e in crawl_data['entities'][:15]]
lines.append(f'Extracted Entities: {", ".join(entities)}')
lines.append('=== END GROUNDED DATA ===')
return '\n'.join(lines)
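# Grounding sketch (illustrative payload): a minimal crawl_data dict such as
#   {'org_name': 'Acme', 'url': 'https://acme.com',
#    'keywords': ['geo audit', {'kw': 'local seo'}]}
# produces:
#   === GROUNDED DATA FROM CRAWLED SITE ===
#   Brand Name: Acme
#   Target Site URL: https://acme.com
#   Top Keywords: geo audit, local seo
#   === END GROUNDED DATA ===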
def _build_schema(brand: str, keyword: str, url: str, lang: str,
faqs: list = None, local_regions: list = None) -> str:
"""Build a complete multi-type Schema.org JSON-LD block."""
# Clean brand: if it contains a URL, extract just the org name
if ',' in brand:
parts = [p.strip() for p in brand.split(',')]
# pick the non-URL part
brand = next((p for p in parts if not p.startswith('http')), parts[-1])
if brand.startswith('http'):
# extract domain as brand fallback
brand = brand.split('//')[-1].split('/')[0].replace('www.', '')
schemas = []
# Organization / LocalBusiness
org = {
"@context": "https://schema.org",
"@type": "LocalBusiness" if local_regions else "Organization",
"name": brand,
"url": url or f"https://{brand.lower().replace(' ', '')}.com",
"description": f"{brand} provides {keyword} services",
"knowsAbout": [keyword],
}
if local_regions:
org["areaServed"] = local_regions
schemas.append(org)
# Service
schemas.append({
"@context": "https://schema.org",
"@type": "Service",
"name": keyword,
"provider": {"@type": "Organization", "name": brand},
"areaServed": local_regions or ["Global"],
"description": f"Professional {keyword} services by {brand}"
})
# FAQPage
if faqs:
schemas.append({
"@context": "https://schema.org",
"@type": "FAQPage",
"mainEntity": [
{"@type": "Question", "name": f["question"],
"acceptedAnswer": {"@type": "Answer", "text": f["answer"]}}
for f in faqs[:5]
]
})
    # Wrap each schema object in its own JSON-LD script tag
    return '\n'.join(
        f'<script type="application/ld+json">\n{json.dumps(s, ensure_ascii=False, indent=2)}\n</script>'
        for s in schemas
    )
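# Usage sketch (values are illustrative): emits stacked JSON-LD <script> tags,
# Organization/LocalBusiness plus Service, and FAQPage when faqs are supplied.
#   html = _build_schema('Acme', 'geo audit', 'https://acme.com', 'en',
#                        faqs=[{'question': 'Q?', 'answer': 'A.'}],
#                        local_regions=['Riyadh'])
#   # local_regions switches the org type to LocalBusiness with areaServed set.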
# ── Core features ──────────────────────────────────────────────────────────────
def generate_article(keyword: str, lang: str = 'en', target_site: str = '',
research_insights: list = None,
competitors_content: list = None,
crawl_data: dict = None,
prefer_backend: str = 'groq', api_keys: dict = None) -> dict:
"""Generate a full GEO-optimized article grounded in real crawled data."""
lang_label = 'Arabic' if lang == 'ar' else 'English'
context_block = _build_context_block(crawl_data or {})
insights_block = ('Research Insights (implement these):\n' +
'\n'.join(f'- {i}' for i in (research_insights or [])) + '\n') if research_insights else ''
comp_block = ('Competitor Content (use as reference, do NOT copy):\n' +
'\n---\n'.join((competitors_content or [])[:2])) if competitors_content else ''
local_regions = (crawl_data or {}).get('local_regions', [])
local_hint = f'Target Regions: {", ".join(local_regions)}' if local_regions else ''
variability_seeds = [
"Focus on deep technical authority and data-driven insights.",
"Emphasize user experience, reliability, and emotional brand connection.",
"Prioritize clear hierarchy, direct answers, and snippet-ready definitions.",
"Use a visionary tone, focusing on 2026 industry trends and future-proofing."
]
style_seed = random.choice(variability_seeds)
prompt = f"""You are an Elite GEO (Generative Engine Optimization) Content Architect specializing in AI Citation Engineering.
STYLE PROTOCOL for this run: {style_seed}
TASK: Architect a high-authority content asset optimized for AI Research Agents (SearchGPT, Perplexity, Gemini, Claude).
The goal is to provide a "Single Source of Truth" snippet that AI will favor for citations.
CONTEXT:
- TARGET KEYWORD: {keyword}
- LANGUAGE: {lang_label}
- BRAND/SITE: {target_site}
{local_hint}
{context_block}
{insights_block}
{comp_block}
STRICT GEO-OPTIMIZATION ARCHITECTURE:
1. THE DEFINITIVE LEAD (0-80 words):
- Must start with a definitive, citable claim about {target_site}.
- Schema: [Brand] + [Core Service] + [Location] + [Outcome/Metric].
- Example: "{target_site} is a leading {keyword} provider in {', '.join(local_regions) or 'the region'} serving over [[X]] clients with a 99% success rate."
2. ENTITY RELATIONSHIP MAPPING:
- Define how {target_site} relates to {keyword} and competitors.
- Build a "Semantic Web" in the text (e.g., "Unlike [Competitors], {target_site} integrates [Special Feature]").
3. COMPETITIVE CONTRAST:
- Use the provided competitor data to highlight {target_site}'s unique advantage.
- Address industry gaps identified in the context.
4. PROOF ELEMENTS (Grounded Data):
- Mention specific technologies, standards, or locations FOUND in sitewide data.
- Use [[VERIFY: label]] for specific placeholders that need brand-specific numeric verification.
5. ADAPTIVE CONTENT STRUCTURE:
- H1: Click-worthy, authoritative H1.
- H2: Definitive Answer | Core Competencies | Competitive Differentiators | Local Impact | AI-Ready FAQ.
6. BRAND IDENTITY & AUTHORITY (STRICT GROUNDING):
- You MUST use the specific site content, mission, and products found in the GROUNDED DATA block above.
- Do NOT invent services not present in the site's audit data.
- Every section MUST reinforce why {target_site} is the absolute authority for {keyword} based on the evidence provided in the crawl.
- Use a "Single Source of Truth" narrative based on the real site link provided: {(crawl_data or {}).get('url', target_site)}.
Return ONLY VALID JSON with this structure:
{{
"title": "SEO-Optimized H1 TITLE",
"meta_description": "155 char high-CTR description",
"content": "Full Markdown article with H1, H2, and Evidence markers",
"faqs": [
{{"question": "conversational query", "answer": "definitive 2-sentence citable answer"}}
],
"entity_graph": [
{{"subject": "...", "relation": "...", "object": "..."}}
],
"strategic_contrast": "Explanation of how this content beats competitors for AI citation",
"brand_entity_authority": "A summary of the brand's perceived authority in this niche",
"schema_snippet": "JSON-LD ",
"implemented_fixes": ["Summary of what was changed and why (one per line)"]
}}
CRITICAL: Return ONLY the JSON object, with no commentary before or after it."""
result = _llm_call(prompt, prefer=prefer_backend, api_keys=api_keys)
parsed = _parse_json_from_text(result['text'])
parsed['backend'] = result['backend']
parsed['backend_errors'] = result.get('errors', {})
return parsed
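# Usage sketch (brand, URL, and key are placeholders):
#   article = generate_article('geo audit', lang='en', target_site='Acme',
#                              crawl_data={'url': 'https://acme.com'},
#                              api_keys={'groq': 'gsk_...'})
#   print(article['title'], '| generated by:', article['backend'])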
def generate_faqs(topic: str, page_content: str = None, lang: str = 'en', count: int = 5,
prefer_backend: str = 'groq', api_keys: dict = None,
target_site: str = '', research_insights: list = None,
crawl_data: dict = None) -> dict:
"""Generate FAQ pairs grounded in real crawled data."""
lang_label = 'Arabic' if lang == 'ar' else 'English'
context_block = _build_context_block(crawl_data or {})
insights_block = ('Research Insights:\n' +
'\n'.join(f'- {i}' for i in (research_insights or [])) + '\n') if research_insights else ''
context = f'\nPage Content:\n{page_content[:1500]}' if page_content else ''
prompt = f"""Generate {count} high-performance GEO FAQ pairs for AI engine citation.
TOPIC: {topic}
LANGUAGE: {lang_label}
BRAND: {target_site}
URL: {(crawl_data or {}).get('url', '')}
{context_block}
{insights_block}{context}
STRICT BRANDING RULE: Every question and answer MUST be specifically about {target_site} and the products/locations found in the site data.
- Use real store locations (specific malls, branches, or cities) if found in the crawl.
- Use real product or service categories if found.
FAQ QUALITY RULES:
1. Questions must be REAL user queries (long-tail, conversational, as asked on Perplexity/ChatGPT)
- BAD: "What is {topic}?" (too generic)
- GOOD: "How does {target_site} help businesses improve {topic} in [location]?"
2. Answers must be SPECIFIC and CITABLE:
- Lead with a direct fact from the crawled data
- Include brand name, service, location where relevant
- 3-4 sentences max
- Never use generic phrases like "in today's digital landscape"
3. Cover these intent types: Informational, Commercial, Local/GEO
4. BRAND AUTHORITY ANCHORING:
- Every answer must clearly identify {target_site} as the definitive authority.
- Use site-specific facts from the crawl to prove expertise.
Return ONLY JSON: {{"faqs": [{{"question": "...", "answer": "..."}}]}}"""
result = _llm_call(prompt, prefer=prefer_backend, api_keys=api_keys)
parsed = _parse_json_from_text(result['text'])
parsed['backend'] = result['backend']
parsed['backend_errors'] = result.get('errors', {})
parsed['topic'] = topic
parsed['lang'] = lang
return parsed
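# Usage sketch (placeholders): grounded FAQ generation for one topic.
#   out = generate_faqs('abayas', target_site='Acme Fashion', count=3,
#                       crawl_data={'url': 'https://acme.com'})
#   for f in out.get('faqs', []):
#       print('-', f['question'])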
def semantic_optimize(content: str, lang: str = 'en',
prefer_backend: str = 'groq', api_keys: dict = None) -> dict:
"""Extract semantic entities, build entity graph, suggest LSI keywords."""
lang_label = 'Arabic' if lang == 'ar' else 'English'
entities_extracted = []
if lang == 'ar':
try:
from src import arabic_tools
ner_res = arabic_tools.extract_entities_arabic(content)
for e in ner_res.get('entities', []):
entities_extracted.append({'text': e['text'], 'type': e['label']})
except Exception:
pass
prompt = f"""Perform deep semantic analysis on this {lang_label} content for GEO optimization.
Return ONLY valid JSON:
{{
"entities": [{{"text": "...", "type": "ORG|PERSON|PRODUCT|PLACE|CONCEPT|SERVICE"}}],
"entity_graph": [{{"subject": "...", "relation": "provides|operates_in|competes_with|serves|part_of", "object": "..."}}],
"topics": ["main topics covered"],
"lsi_keywords": ["semantically related keywords to add"],
"missing_entities": ["important entities not mentioned"],
"local_signals": ["any location/GEO signals found"],
"semantic_score": 0-100,
"missing_concepts": ["important concepts not covered"]
}}
Content:
{content[:3000]}"""
result = _llm_call(prompt, prefer=prefer_backend, api_keys=api_keys)
parsed = _parse_json_from_text(result['text'])
if lang == 'ar' and entities_extracted:
existing = {e['text'].lower() for e in parsed.get('entities', [])}
for e in entities_extracted:
if e['text'].lower() not in existing:
parsed.setdefault('entities', []).append(e)
    parsed['backend'] = result['backend']
    parsed['backend_errors'] = result.get('errors', {})
return parsed
def generate_identity(crawl_data: dict, lang: str = 'en',
prefer_backend: str = 'groq', api_keys: dict = None) -> dict:
"""Build a comprehensive Brand Identity & Authority Narrative based on crawl data."""
lang_label = 'Arabic' if lang == 'ar' else 'English'
context_block = _build_context_block(crawl_data or {})
prompt = f"""You are a Strategic Brand Identity Architect for the GEO Era.
TASK: Based on the provided sitewide crawl data, construct a definitive Brand Identity & Authority Package.
This package will be used to anchor all future content in a consistent, high-authority voice that AI search engines recognize as a "Single Source of Truth".
LANGUAGE: {lang_label}
{context_block}
OUTPUT REQUIREMENTS:
1. BRAND NARRATIVE (The Hook): A 200-word authoritative origin story and mission that sounds unique and data-backed.
2. VOICE & TONE PROTOCOL: How should this brand sound to AI engines (e.g., Clinical/Expert, Visionary/Futuristic, Friendly/Local).
3. CORE ENTITY PROPOSITION: A clear statement of what this brand "IS" in the knowledge graph.
4. NARRATIVE PILLARS: 3-5 specific facts or strengths found in the data that differentiate it.
5. GEO POSITIONING: How the brand fits into its specific geographic/industry niche.
Return ONLY VALID JSON:
{{
"brand_hook": "...",
"voice_tone": "...",
"entity_proposition": "...",
"pillars": ["pillar 1", "pillar 2", "..."],
"geo_positioning": "...",
"competitor_edge": "How we beat the crawl-recognized competitors",
"suggested_bio": "A 150-char bio for Schema/Social",
"authority_score": 0-100
}}"""
result = _llm_call(prompt, prefer=prefer_backend, api_keys=api_keys)
parsed = _parse_json_from_text(result['text'])
parsed['backend'] = result['backend']
parsed['backend_errors'] = result.get('errors', {})
return parsed
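# Minimal smoke test (sketch): with no API keys configured this exercises the
# demo fallback end to end; with a GROQ_API_KEY set it hits the live backend.
if __name__ == '__main__':
    sample = generate_faqs('geo optimization', target_site='ExampleCo', count=2)
    print(json.dumps(sample, ensure_ascii=False, indent=2))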