# NOTE: removed non-Python page chrome that was pasted above the module
# (Hugging Face blob-view header: uploader avatar, commit "clean json"
# 69bc060, raw/history/blame links, "15.2 kB") — it made the file unparseable.
# Enhanced Content Optimization Module with RAG for GEO
# Integrates RAG functionality for better Generative Engine Optimization
import json
import re
from typing import Dict, Any, List, Optional
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.schema import Document
class ContentOptimizer:
    """Enhanced Content Optimizer with RAG capabilities for GEO"""

    def __init__(self, llm, vector_chunker=None):
        # llm: chat-model handle invoked by every optimization prompt
        #      (piped via LangChain's `prompt | llm` in the helpers below).
        # vector_chunker: optional helper exposing create_qa_chain(docs, llm);
        #      when None, optimization falls back to the static knowledge base.
        self.llm = llm
        self.vector_chunker = vector_chunker
        # Build the prompt templates, then the GEO best-practices notes.
        self.setup_prompts()
        self.setup_geo_knowledge_base()
def setup_geo_knowledge_base(self):
    """Populate ``self.geo_knowledge`` with three GEO best-practice notes.

    These notes double as the default prompt context and as the documents
    fed to the optional retrieval chain.
    """
    # Note 1: structuring and phrasing content so LLMs can consume it.
    structure_note = """
Generative Engine Optimization (GEO) Best Practices:
1. Structure for AI Consumption:
- Use clear headings and subheadings
- Include bullet points and numbered lists
- Provide direct, concise answers to common questions
- Use schema markup when possible
2. Content Format for LLMs:
- Answer questions directly in the first sentence
- Use "what, why, how" question patterns
- Include relevant entities and proper nouns
- Maintain factual accuracy with citations
3. Semantic Optimization:
- Include related terms and synonyms
- Use entity-rich content (people, places, organizations)
- Connect concepts with clear relationships
- Optimize for topic clusters, not just keywords
"""
    # Note 2: visibility in AI search / answer engines.
    visibility_note = """
AI Search Visibility Optimization:
1. Query Intent Matching:
- Address user intent explicitly
- Use natural language patterns
- Include question-answer pairs
- Optimize for conversational queries
2. Citation Worthiness:
- Include authoritative sources and data
- Use specific facts and statistics
- Provide expert opinions and insights
- Maintain consistent tone and expertise
3. Multi-Query Coverage:
- Address related questions in the same content
- Use comprehensive topic coverage
- Include long-tail and specific queries
- Provide context for complex topics
"""
    # Note 3: information architecture and conversational readiness.
    architecture_note = """
Content Structure for AI Systems:
1. Information Architecture:
- Lead with key information
- Use inverted pyramid structure
- Include table of contents for long content
- Break complex topics into digestible sections
2. Conversational Readiness:
- Write in active voice
- Use clear, direct language
- Include transitional phrases
- Optimize sentence length (12-20 words)
3. Context Completeness:
- Define technical terms
- Provide background information
- Include relevant examples
- Connect to broader topic context
"""
    self.geo_knowledge = [structure_note, visibility_note, architecture_note]
def setup_prompts(self):
    """Initialize optimization prompts with RAG integration"""
    # System-message template for the standard GEO pass.  LangChain fills
    # {context} (knowledge-base text) and {content} (user content) at
    # invocation time; the doubled braces {{ }} in the JSON example are
    # escaped so they survive the template's .format step as literal braces.
    self.rag_enhancement_prompt = """
You are a Generative Engine Optimization (GEO) specialist with access to best practices knowledge.
Based on the provided GEO knowledge and the user's content, optimize the content for:
1. AI search engines (ChatGPT, Claude, Gemini)
2. LLM-based question answering systems
3. Conversational AI interfaces
4. Citation and reference systems
Use the knowledge base to inform your optimization decisions.
Knowledge Base Context:
{context}
Original Content:
{content}
Provide comprehensive GEO optimization in JSON format:
```json
{{
"geo_analysis": {{
"current_geo_score": 7.5,
"ai_search_visibility": 8.0,
"query_intent_matching": 7.0,
"conversational_readiness": 8.5,
"citation_worthiness": 6.5,
"context_completeness": 7.5
}},
"optimization_opportunities": [
{{
"type": "Structure Enhancement",
"description": "Add clear headings and Q&A format",
"priority": "high",
"expected_impact": "Improve AI parsing by 25%"
}}
],
"optimized_content": {{
"enhanced_text": "Your optimized content here...",
"structural_improvements": ["Added FAQ section", "Improved headings"],
"semantic_enhancements": ["Added related terms", "Improved entity density"]
}},
"geo_keywords": {{
"primary_entities": ["entity1", "entity2"],
"semantic_terms": ["term1", "term2"],
"question_patterns": ["What is...", "How does..."],
"related_concepts": ["concept1", "concept2"]
}},
"recommendations": [
"Add more specific examples",
"Include authoritative citations",
"Improve conversational flow"
]
}}
```
""".strip()
    # System-message template for the competitive gap analysis; same
    # {context}/{content} placeholders and {{ }} brace escaping as above.
    self.competitive_geo_prompt = """
Analyze the content against GEO best practices and identify competitive optimization opportunities.
GEO Knowledge Base:
{context}
Content to Analyze:
{content}
Provide competitive GEO analysis:
```json
{{
"competitive_gaps": {{
"missing_question_patterns": ["What questions aren't covered"],
"entity_gaps": ["Important entities not mentioned"],
"semantic_opportunities": ["Related terms to include"],
"structural_weaknesses": ["Formatting issues for AI"]
}},
"benchmark_comparison": {{
"current_performance": {{
"ai_answerability": 6.5,
"semantic_richness": 7.0,
"structural_clarity": 8.0
}},
"optimization_potential": {{
"ai_answerability": 9.0,
"semantic_richness": 8.5,
"structural_clarity": 9.5
}}
}},
"action_plan": [
{{
"priority": "high",
"action": "Add FAQ section",
"rationale": "Improves direct question answering"
}}
]
}}
```
""".strip()
def optimize_content_with_rag(self, content: str, optimization_type: str = "geo_standard", analyze_only: bool = False) -> Dict[str, Any]:
    """Optimize ``content`` for generative engines, optionally RAG-grounded.

    content: text to optimize (first 500 chars seed the retrieval query).
    optimization_type: "competitive_geo" selects the gap analysis; anything
        else runs the standard pass.
    analyze_only: forwarded to the standard pass (recorded in its result).
    Returns the parsed optimization dict, or {"error": ...} on any failure.
    """
    try:
        # Default context: the raw best-practice notes joined together.
        context = "\n\n".join(self.geo_knowledge)
        if self.vector_chunker:
            # FIX: only wrap the notes in Document objects when a retrieval
            # chain is actually built (previously done unconditionally).
            knowledge_docs = [Document(page_content=k, metadata={"source": "geo_best_practices"}) for k in self.geo_knowledge]
            qa_chain = self.vector_chunker.create_qa_chain(knowledge_docs, self.llm)
            geo_query = f"How to optimize this type of content for AI search engines: {content[:500]}"
            context_result = qa_chain({"query": geo_query})
            # Fall back to the joined notes if the chain yields no result.
            context = context_result.get("result", context)
        if optimization_type == "competitive_geo":
            return self._competitive_geo_optimization(content, context)
        return self._standard_geo_optimization(content, context, analyze_only)
    except Exception as e:
        return {"error": f"RAG-enhanced optimization failed: {str(e)}"}
def _standard_geo_optimization(self, content: str, context: str, analyze_only: bool) -> Dict[str, Any]:
try:
prompt = ChatPromptTemplate.from_messages([
SystemMessagePromptTemplate.from_template(self.rag_enhancement_prompt),
HumanMessagePromptTemplate.from_template("Optimize this content using GEO best practices.")
])
result = (prompt | self.llm).invoke({"context": context, "content": content[:5000]})
parsed = self._parse_optimization_result(getattr(result, 'content', str(result)))
parsed.update({
'optimization_type': 'geo_standard',
'rag_enhanced': True,
'analyze_only': analyze_only,
'original_length': len(content),
'knowledge_sources': len(self.geo_knowledge)
})
return parsed
except Exception as e:
return {"error": f"Standard GEO optimization failed: {str(e)}"}
def _competitive_geo_optimization(self, content: str, context: str) -> Dict[str, Any]:
try:
prompt = ChatPromptTemplate.from_messages([
SystemMessagePromptTemplate.from_template(self.competitive_geo_prompt),
HumanMessagePromptTemplate.from_template("Perform competitive GEO analysis.")
])
result = (prompt | self.llm).invoke({"context": context, "content": content[:5000]})
parsed = self._parse_optimization_result(getattr(result, 'content', str(result)))
parsed.update({
'optimization_type': 'competitive_geo',
'rag_enhanced': True,
'competitive_analysis': True
})
return parsed
except Exception as e:
return {"error": f"Competitive GEO optimization failed: {str(e)}"}
def batch_optimize_with_rag(self, content_list: List[str], optimization_type: str = "geo_standard") -> List[Dict[str, Any]]:
    """Optimize each item in ``content_list``, tagging results by position.

    Per-item failures are captured as {'batch_index': i, 'error': ...}
    entries so one bad item never aborts the batch.
    """
    outputs = []
    for index, item in enumerate(content_list):
        try:
            entry = self.optimize_content_with_rag(item, optimization_type)
            entry['batch_index'] = index
        except Exception as e:
            entry = {
                'batch_index': index,
                'error': f"Batch GEO optimization failed: {str(e)}"
            }
        outputs.append(entry)
    return outputs
def analyze_geo_readability(self, content: str) -> Dict[str, Any]:
    """Score how AI-friendly ``content`` is, using cheap surface metrics.

    Returns a dict with raw counts ('geo_readability_metrics'), a 0-10
    composite score ('geo_readability_score'), and textual
    recommendations; any failure yields {'error': ...} instead.
    """
    try:
        words = content.split()
        sentences = [s.strip() for s in re.split(r'[.!?]+', content) if s.strip()]
        # FIX: dropped the `paragraphs` split the old code computed but never used.
        metrics = {
            'questions': len(re.findall(r'\?', content)),
            'headings': len(re.findall(r'^#+\s', content, re.MULTILINE)),
            'lists': len(re.findall(r'^\s*[-*+]\s', content, re.MULTILINE)),
            # Crude entity proxy: runs of Capitalized Words (also counts
            # sentence-initial words — known overcount).
            'entities': len(re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', content)),
            'numbers': len(re.findall(r'\b\d+\.?\d*\b', content)),
            'sentence_count': len(sentences),
            'word_count': len(words)
        }
        sentence_count = metrics['sentence_count']
        word_count = metrics['word_count']
        # Ratios guard against empty content (zero sentences/words).
        geo_score = self._calculate_geo_readability_score({
            'avg_words_per_sentence': word_count / sentence_count if sentence_count else 0,
            'questions_ratio': metrics['questions'] / sentence_count if sentence_count else 0,
            'structure_elements': metrics['headings'] + metrics['lists'],
            'entity_density': metrics['entities'] / word_count if word_count else 0,
            'numeric_data': metrics['numbers'] / word_count if word_count else 0
        })
        return {
            'geo_readability_metrics': metrics,
            'geo_readability_score': geo_score,
            'geo_recommendations': self._generate_geo_recommendations(metrics)
        }
    except Exception as e:
        return {'error': f"GEO readability analysis failed: {str(e)}"}
def _calculate_geo_readability_score(self, m: Dict[str, float]) -> float:
try:
score = (
max(0, 10 - abs(m['avg_words_per_sentence'] - 15) * 0.3) * 0.2 +
min(10, m['questions_ratio'] * 50) * 0.25 +
min(10, m['structure_elements'] * 1.5) * 0.25 +
min(10, m['entity_density'] * 100) * 0.15 +
min(10, m['numeric_data'] * 200) * 0.15
)
return round(score, 1)
except Exception:
return 5.0
def _generate_geo_recommendations(self, m: Dict[str, int]) -> List[str]:
r = []
if m['questions'] == 0:
r.append("Add FAQ section or question-based headings.")
if m['headings'] < 2:
r.append("Use more structured headings.")
if m['lists'] == 0:
r.append("Include bullet points or numbered lists.")
if m['entities'] < 5:
r.append("Add named or topical entities.")
if m['questions'] / m['sentence_count'] < 0.1:
r.append("Transform statements into Q&A pairs.")
return r
def _clean_json_string(self, json_str: str) -> str:
json_str = json_str.replace("...", "")
json_str = re.sub(r",\s*([}\]])", r"\\1", json_str)
json_str = json_str.strip('`')
return json_str
def _parse_optimization_result(self, response_text: str) -> Dict[str, Any]:
try:
start = response_text.find('{')
end = response_text.rfind('}') + 1
if start != -1 and end != -1:
json_str = self._clean_json_string(response_text[start:end])
return json.loads(json_str)
return {
'raw_response': response_text,
'parsing_error': 'No JSON structure found in response',
'geo_analysis': {},
'recommendations': []
}
except json.JSONDecodeError as e:
return {
'raw_response': response_text,
'parsing_error': f'JSON decode error: {str(e)}',
'geo_analysis': {},
'recommendations': []
}
except Exception as e:
return {
'raw_response': response_text,
'parsing_error': f'Unexpected error: {str(e)}',
'geo_analysis': {},
'recommendations': []
}
# Legacy support methods
def optimize_content(self, content: str, analyze_only: bool = False, include_keywords: bool = True, optimization_type: str = "standard") -> Dict[str, Any]:
    """Backward-compatible wrapper; delegates to the RAG-enhanced optimizer.

    NOTE: ``include_keywords`` is accepted for signature compatibility but
    is not forwarded (ignored by the RAG path).
    """
    return self.optimize_content_with_rag(content, optimization_type, analyze_only)
def analyze_content_readability(self, content: str) -> Dict[str, Any]:
    """Backward-compatible alias for :meth:`analyze_geo_readability`."""
    return self.analyze_geo_readability(content)