# NOTE: removed stray extraction artifacts ("Spaces:", "Runtime error" x2) that were not part of the module source.
# Enhanced Content Optimization Module with RAG for GEO
# Integrates RAG functionality for better Generative Engine Optimization
import json
import re
from typing import Dict, Any, List, Optional

from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.schema import Document
class ContentOptimizer:
    """Enhanced Content Optimizer with RAG capabilities for GEO.

    Wraps an LLM (and, optionally, a retrieval helper) to analyze and rewrite
    content for Generative Engine Optimization (GEO): visibility in AI search
    engines, LLM-based question answering, and conversational interfaces.

    All public methods return plain dicts; failures are reported as
    ``{"error": "..."}`` rather than raised, matching the original contract.
    """

    def __init__(self, llm, vector_chunker=None):
        """
        Args:
            llm: LangChain-compatible chat model used for optimization calls.
            vector_chunker: Optional helper exposing
                ``create_qa_chain(docs, llm)``. When provided, knowledge-base
                context is retrieved via RAG instead of passed verbatim.
        """
        self.llm = llm
        self.vector_chunker = vector_chunker
        self.setup_prompts()
        self.setup_geo_knowledge_base()

    def setup_geo_knowledge_base(self):
        """Initialize the static GEO best-practices knowledge base.

        Populates ``self.geo_knowledge`` with three curated text chunks used
        either directly as prompt context or as retrieval documents.
        """
        self.geo_knowledge = [
            """
Generative Engine Optimization (GEO) Best Practices:
1. Structure for AI Consumption:
- Use clear headings and subheadings
- Include bullet points and numbered lists
- Provide direct, concise answers to common questions
- Use schema markup when possible
2. Content Format for LLMs:
- Answer questions directly in the first sentence
- Use "what, why, how" question patterns
- Include relevant entities and proper nouns
- Maintain factual accuracy with citations
3. Semantic Optimization:
- Include related terms and synonyms
- Use entity-rich content (people, places, organizations)
- Connect concepts with clear relationships
- Optimize for topic clusters, not just keywords
""",
            """
AI Search Visibility Optimization:
1. Query Intent Matching:
- Address user intent explicitly
- Use natural language patterns
- Include question-answer pairs
- Optimize for conversational queries
2. Citation Worthiness:
- Include authoritative sources and data
- Use specific facts and statistics
- Provide expert opinions and insights
- Maintain consistent tone and expertise
3. Multi-Query Coverage:
- Address related questions in the same content
- Use comprehensive topic coverage
- Include long-tail and specific queries
- Provide context for complex topics
""",
            """
Content Structure for AI Systems:
1. Information Architecture:
- Lead with key information
- Use inverted pyramid structure
- Include table of contents for long content
- Break complex topics into digestible sections
2. Conversational Readiness:
- Write in active voice
- Use clear, direct language
- Include transitional phrases
- Optimize sentence length (12-20 words)
3. Context Completeness:
- Define technical terms
- Provide background information
- Include relevant examples
- Connect to broader topic context
""",
        ]

    def setup_prompts(self):
        """Initialize optimization prompt templates with RAG placeholders.

        Both templates expect ``{context}`` and ``{content}`` variables;
        literal JSON braces are escaped as ``{{``/``}}`` for LangChain's
        f-string templating.
        """
        self.rag_enhancement_prompt = """
You are a Generative Engine Optimization (GEO) specialist with access to best practices knowledge.
Based on the provided GEO knowledge and the user's content, optimize the content for:
1. AI search engines (ChatGPT, Claude, Gemini)
2. LLM-based question answering systems
3. Conversational AI interfaces
4. Citation and reference systems
Use the knowledge base to inform your optimization decisions.
Knowledge Base Context:
{context}
Original Content:
{content}
Provide comprehensive GEO optimization in JSON format:
```json
{{
"geo_analysis": {{
"current_geo_score": 7.5,
"ai_search_visibility": 8.0,
"query_intent_matching": 7.0,
"conversational_readiness": 8.5,
"citation_worthiness": 6.5,
"context_completeness": 7.5
}},
"optimization_opportunities": [
{{
"type": "Structure Enhancement",
"description": "Add clear headings and Q&A format",
"priority": "high",
"expected_impact": "Improve AI parsing by 25%"
}}
],
"optimized_content": {{
"enhanced_text": "Your optimized content here...",
"structural_improvements": ["Added FAQ section", "Improved headings"],
"semantic_enhancements": ["Added related terms", "Improved entity density"]
}},
"geo_keywords": {{
"primary_entities": ["entity1", "entity2"],
"semantic_terms": ["term1", "term2"],
"question_patterns": ["What is...", "How does..."],
"related_concepts": ["concept1", "concept2"]
}},
"recommendations": [
"Add more specific examples",
"Include authoritative citations",
"Improve conversational flow"
]
}}
```
""".strip()
        self.competitive_geo_prompt = """
Analyze the content against GEO best practices and identify competitive optimization opportunities.
GEO Knowledge Base:
{context}
Content to Analyze:
{content}
Provide competitive GEO analysis:
```json
{{
"competitive_gaps": {{
"missing_question_patterns": ["What questions aren't covered"],
"entity_gaps": ["Important entities not mentioned"],
"semantic_opportunities": ["Related terms to include"],
"structural_weaknesses": ["Formatting issues for AI"]
}},
"benchmark_comparison": {{
"current_performance": {{
"ai_answerability": 6.5,
"semantic_richness": 7.0,
"structural_clarity": 8.0
}},
"optimization_potential": {{
"ai_answerability": 9.0,
"semantic_richness": 8.5,
"structural_clarity": 9.5
}}
}},
"action_plan": [
{{
"priority": "high",
"action": "Add FAQ section",
"rationale": "Improves direct question answering"
}}
]
}}
```
""".strip()

    def optimize_content_with_rag(self, content: str, optimization_type: str = "geo_standard", analyze_only: bool = False) -> Dict[str, Any]:
        """Optimize content using the GEO knowledge base, optionally via RAG.

        Args:
            content: The raw text to optimize.
            optimization_type: ``"geo_standard"`` (default) or
                ``"competitive_geo"`` for gap analysis.
            analyze_only: Forwarded to the standard optimization path.

        Returns:
            Parsed optimization report dict, or ``{"error": ...}`` on failure.
        """
        try:
            # Default context: the full knowledge base, joined verbatim.
            context = "\n\n".join(self.geo_knowledge)
            if self.vector_chunker:
                # Build Documents only when retrieval is actually used
                # (previously they were constructed unconditionally and
                # discarded on the non-RAG path).
                knowledge_docs = [
                    Document(page_content=k, metadata={"source": "geo_best_practices"})
                    for k in self.geo_knowledge
                ]
                qa_chain = self.vector_chunker.create_qa_chain(knowledge_docs, self.llm)
                # Only a 500-char preview is needed to characterize the content.
                geo_query = f"How to optimize this type of content for AI search engines: {content[:500]}"
                context_result = qa_chain({"query": geo_query})
                # Fall back to the verbatim knowledge base if retrieval
                # returned no "result" key.
                context = context_result.get("result", context)
            if optimization_type == "competitive_geo":
                return self._competitive_geo_optimization(content, context)
            return self._standard_geo_optimization(content, context, analyze_only)
        except Exception as e:
            return {"error": f"RAG-enhanced optimization failed: {str(e)}"}

    def _standard_geo_optimization(self, content: str, context: str, analyze_only: bool) -> Dict[str, Any]:
        """Run the standard GEO optimization prompt and parse the result.

        Content is truncated to 5000 chars to bound prompt size. The parsed
        dict is annotated with bookkeeping metadata before being returned.
        """
        try:
            prompt = ChatPromptTemplate.from_messages([
                SystemMessagePromptTemplate.from_template(self.rag_enhancement_prompt),
                HumanMessagePromptTemplate.from_template("Optimize this content using GEO best practices.")
            ])
            result = (prompt | self.llm).invoke({"context": context, "content": content[:5000]})
            # Chat models return a message object with .content; plain LLMs may
            # return a bare string.
            parsed = self._parse_optimization_result(getattr(result, 'content', str(result)))
            parsed.update({
                'optimization_type': 'geo_standard',
                'rag_enhanced': True,
                'analyze_only': analyze_only,
                'original_length': len(content),
                'knowledge_sources': len(self.geo_knowledge)
            })
            return parsed
        except Exception as e:
            return {"error": f"Standard GEO optimization failed: {str(e)}"}

    def _competitive_geo_optimization(self, content: str, context: str) -> Dict[str, Any]:
        """Run the competitive-gap GEO prompt and parse the result."""
        try:
            prompt = ChatPromptTemplate.from_messages([
                SystemMessagePromptTemplate.from_template(self.competitive_geo_prompt),
                HumanMessagePromptTemplate.from_template("Perform competitive GEO analysis.")
            ])
            result = (prompt | self.llm).invoke({"context": context, "content": content[:5000]})
            parsed = self._parse_optimization_result(getattr(result, 'content', str(result)))
            parsed.update({
                'optimization_type': 'competitive_geo',
                'rag_enhanced': True,
                'competitive_analysis': True
            })
            return parsed
        except Exception as e:
            return {"error": f"Competitive GEO optimization failed: {str(e)}"}

    def batch_optimize_with_rag(self, content_list: List[str], optimization_type: str = "geo_standard") -> List[Dict[str, Any]]:
        """Optimize each item in ``content_list``; one result dict per item.

        Per-item failures are recorded in-place (with ``batch_index``) so a
        single bad item never aborts the batch.
        """
        results = []
        for i, content in enumerate(content_list):
            try:
                result = self.optimize_content_with_rag(content, optimization_type)
                result['batch_index'] = i
                results.append(result)
            except Exception as e:
                results.append({
                    'batch_index': i,
                    'error': f"Batch GEO optimization failed: {str(e)}"
                })
        return results

    def analyze_geo_readability(self, content: str) -> Dict[str, Any]:
        """Compute heuristic GEO-readability metrics for ``content``.

        Counts questions, markdown headings/lists, capitalized-word "entities"
        (a rough proxy — matches any Capitalized Word run, not true NER),
        numbers, sentences, and words; then derives a 0-10 score and textual
        recommendations.
        """
        try:
            words = content.split()
            # Sentence boundaries: any run of . ! ? — empty fragments dropped.
            sentences = [s.strip() for s in re.split(r'[.!?]+', content) if s.strip()]
            paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]
            metrics = {
                'questions': len(re.findall(r'\?', content)),
                'headings': len(re.findall(r'^#+\s', content, re.MULTILINE)),
                'lists': len(re.findall(r'^\s*[-*+]\s', content, re.MULTILINE)),
                'entities': len(re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', content)),
                'numbers': len(re.findall(r'\b\d+\.?\d*\b', content)),
                'sentence_count': len(sentences),
                'word_count': len(words)
            }
            # Guard every ratio against empty content.
            geo_score = self._calculate_geo_readability_score({
                'avg_words_per_sentence': metrics['word_count'] / metrics['sentence_count'] if metrics['sentence_count'] else 0,
                'questions_ratio': metrics['questions'] / metrics['sentence_count'] if metrics['sentence_count'] else 0,
                'structure_elements': metrics['headings'] + metrics['lists'],
                'entity_density': metrics['entities'] / metrics['word_count'] if metrics['word_count'] else 0,
                'numeric_data': metrics['numbers'] / metrics['word_count'] if metrics['word_count'] else 0
            })
            return {
                'geo_readability_metrics': metrics,
                'geo_readability_score': geo_score,
                'geo_recommendations': self._generate_geo_recommendations(metrics)
            }
        except Exception as e:
            return {'error': f"GEO readability analysis failed: {str(e)}"}

    def _calculate_geo_readability_score(self, m: Dict[str, float]) -> float:
        """Weighted 0-10 score from normalized readability ratios.

        Weights: sentence length 20%, question ratio 25%, structure 25%,
        entity density 15%, numeric data 15%. Ideal sentence length is 15
        words (per the knowledge base's 12-20 word guidance). Returns a
        neutral 5.0 if the metrics dict is malformed.
        """
        try:
            score = (
                max(0, 10 - abs(m['avg_words_per_sentence'] - 15) * 0.3) * 0.2 +
                min(10, m['questions_ratio'] * 50) * 0.25 +
                min(10, m['structure_elements'] * 1.5) * 0.25 +
                min(10, m['entity_density'] * 100) * 0.15 +
                min(10, m['numeric_data'] * 200) * 0.15
            ) 
            return round(score, 1)
        except Exception:
            return 5.0

    def _generate_geo_recommendations(self, m: Dict[str, int]) -> List[str]:
        """Derive actionable GEO recommendations from raw count metrics.

        Requires keys: questions, headings, lists, entities, sentence_count.
        """
        r = []
        if m['questions'] == 0:
            r.append("Add FAQ section or question-based headings.")
        if m['headings'] < 2:
            r.append("Use more structured headings.")
        if m['lists'] == 0:
            r.append("Include bullet points or numbered lists.")
        if m['entities'] < 5:
            r.append("Add named or topical entities.")
        # Guard against ZeroDivisionError on empty content (sentence_count == 0);
        # an empty document trivially has too few Q&A pairs.
        question_ratio = m['questions'] / m['sentence_count'] if m['sentence_count'] else 0.0
        if question_ratio < 0.1:
            r.append("Transform statements into Q&A pairs.")
        return r

    def _clean_json_string(self, json_str: str) -> str:
        """Best-effort repair of near-JSON emitted by the LLM.

        Removes ellipsis placeholders, strips trailing commas before a closing
        brace/bracket, and trims stray backticks from fenced code blocks.
        """
        json_str = json_str.replace("...", "")
        # BUGFIX: the replacement must be the group backreference r"\1";
        # the previous r"\\1" inserted the literal two characters "\1",
        # corrupting the JSON instead of removing the trailing comma.
        json_str = re.sub(r",\s*([}\]])", r"\1", json_str)
        json_str = json_str.strip('`')
        return json_str

    def _parse_optimization_result(self, response_text: str) -> Dict[str, Any]:
        """Extract and parse the outermost JSON object from an LLM response.

        Returns the parsed dict, or a structured fallback dict (with
        ``raw_response`` and ``parsing_error``) when no valid JSON is found.
        Never raises.
        """
        try:
            start = response_text.find('{')
            end = response_text.rfind('}') + 1
            # BUGFIX: `end` is rfind()+1, so it can never be -1 (minimum 0);
            # require a '}' strictly after the '{' for a plausible object.
            if start != -1 and end > start:
                json_str = self._clean_json_string(response_text[start:end])
                return json.loads(json_str)
            return {
                'raw_response': response_text,
                'parsing_error': 'No JSON structure found in response',
                'geo_analysis': {},
                'recommendations': []
            }
        except json.JSONDecodeError as e:
            return {
                'raw_response': response_text,
                'parsing_error': f'JSON decode error: {str(e)}',
                'geo_analysis': {},
                'recommendations': []
            }
        except Exception as e:
            return {
                'raw_response': response_text,
                'parsing_error': f'Unexpected error: {str(e)}',
                'geo_analysis': {},
                'recommendations': []
            }

    # Legacy support methods
    def optimize_content(self, content: str, analyze_only: bool = False, include_keywords: bool = True, optimization_type: str = "standard") -> Dict[str, Any]:
        """Legacy alias for :meth:`optimize_content_with_rag`.

        ``include_keywords`` is accepted for backward compatibility but ignored.
        """
        return self.optimize_content_with_rag(content, optimization_type, analyze_only)

    def analyze_content_readability(self, content: str) -> Dict[str, Any]:
        """Legacy alias for :meth:`analyze_geo_readability`."""
        return self.analyze_geo_readability(content)