Spaces:
Runtime error
Runtime error
| """ | |
| Enhanced Content Optimization Module with RAG for GEO | |
| Integrates RAG functionality for better Generative Engine Optimization | |
| """ | |
| import json | |
| import re | |
| from typing import Dict, Any, List, Optional | |
| from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate | |
| from langchain.schema import Document | |
class ContentOptimizer:
    """Enhanced Content Optimizer with RAG capabilities for GEO"""
    def __init__(self, llm, vector_chunker=None):
        # llm: chat model used by every optimization chain via LangChain's
        #   `prompt | llm` pipe syntax, so it must support .invoke().
        # vector_chunker: optional helper exposing create_qa_chain(docs, llm);
        #   when None, RAG retrieval falls back to the full knowledge base.
        self.llm = llm
        self.vector_chunker = vector_chunker
        # Populate prompt templates and the GEO best-practices corpus.
        self.setup_prompts()
        self.setup_geo_knowledge_base()
    def setup_geo_knowledge_base(self):
        """Initialize GEO best practices knowledge base.

        Stores three plain-text knowledge documents on `self.geo_knowledge`.
        They are wrapped into LangChain `Document` objects by the RAG entry
        points and also joined verbatim as fallback context when no
        vector_chunker is configured.
        """
        self.geo_knowledge = [
            # Document 1: core GEO structuring / formatting practices.
            """
            Generative Engine Optimization (GEO) Best Practices:
            1. Structure for AI Consumption:
            - Use clear headings and subheadings
            - Include bullet points and numbered lists
            - Provide direct, concise answers to common questions
            - Use schema markup when possible
            2. Content Format for LLMs:
            - Answer questions directly in the first sentence
            - Use "what, why, how" question patterns
            - Include relevant entities and proper nouns
            - Maintain factual accuracy with citations
            3. Semantic Optimization:
            - Include related terms and synonyms
            - Use entity-rich content (people, places, organizations)
            - Connect concepts with clear relationships
            - Optimize for topic clusters, not just keywords
            """,
            # Document 2: visibility and citation-worthiness guidance.
            """
            AI Search Visibility Optimization:
            1. Query Intent Matching:
            - Address user intent explicitly
            - Use natural language patterns
            - Include question-answer pairs
            - Optimize for conversational queries
            2. Citation Worthiness:
            - Include authoritative sources and data
            - Use specific facts and statistics
            - Provide expert opinions and insights
            - Maintain consistent tone and expertise
            3. Multi-Query Coverage:
            - Address related questions in the same content
            - Use comprehensive topic coverage
            - Include long-tail and specific queries
            - Provide context for complex topics
            """,
            # Document 3: structural / conversational readiness guidance.
            """
            Content Structure for AI Systems:
            1. Information Architecture:
            - Lead with key information
            - Use inverted pyramid structure
            - Include table of contents for long content
            - Break complex topics into digestible sections
            2. Conversational Readiness:
            - Write in active voice
            - Use clear, direct language
            - Include transitional phrases
            - Optimize sentence length (12-20 words)
            3. Context Completeness:
            - Define technical terms
            - Provide background information
            - Include relevant examples
            - Connect to broader topic context
            """
        ]
    def setup_prompts(self):
        """Initialize optimization prompts with RAG integration.

        Both templates are used as system messages via
        SystemMessagePromptTemplate.from_template; `{context}` and `{content}`
        are filled at invoke time, while doubled braces (`{{`/`}}`) render as
        literal braces in the embedded JSON examples.
        """
        # Template for the standard GEO optimization pass.
        self.rag_enhancement_prompt = """
        You are a Generative Engine Optimization (GEO) specialist with access to best practices knowledge.
        Based on the provided GEO knowledge and the user's content, optimize the content for:
        1. AI search engines (ChatGPT, Claude, Gemini)
        2. LLM-based question answering systems
        3. Conversational AI interfaces
        4. Citation and reference systems
        Use the knowledge base to inform your optimization decisions.
        Knowledge Base Context:
        {context}
        Original Content:
        {content}
        Provide comprehensive GEO optimization in JSON format:
        ```json
        {{
            "geo_analysis": {{
                "current_geo_score": 7.5,
                "ai_search_visibility": 8.0,
                "query_intent_matching": 7.0,
                "conversational_readiness": 8.5,
                "citation_worthiness": 6.5,
                "context_completeness": 7.5
            }},
            "optimization_opportunities": [
                {{
                    "type": "Structure Enhancement",
                    "description": "Add clear headings and Q&A format",
                    "priority": "high",
                    "expected_impact": "Improve AI parsing by 25%"
                }}
            ],
            "optimized_content": {{
                "enhanced_text": "Your optimized content here...",
                "structural_improvements": ["Added FAQ section", "Improved headings"],
                "semantic_enhancements": ["Added related terms", "Improved entity density"]
            }},
            "geo_keywords": {{
                "primary_entities": ["entity1", "entity2"],
                "semantic_terms": ["term1", "term2"],
                "question_patterns": ["What is...", "How does..."],
                "related_concepts": ["concept1", "concept2"]
            }},
            "recommendations": [
                "Add more specific examples",
                "Include authoritative citations",
                "Improve conversational flow"
            ]
        }}
        ```
        """
        # Template for the competitive gap-analysis pass.
        self.competitive_geo_prompt = """
        Analyze the content against GEO best practices and identify competitive optimization opportunities.
        GEO Knowledge Base:
        {context}
        Content to Analyze:
        {content}
        Provide competitive GEO analysis:
        ```json
        {{
            "competitive_gaps": {{
                "missing_question_patterns": ["What questions aren't covered"],
                "entity_gaps": ["Important entities not mentioned"],
                "semantic_opportunities": ["Related terms to include"],
                "structural_weaknesses": ["Formatting issues for AI"]
            }},
            "benchmark_comparison": {{
                "current_performance": {{
                    "ai_answerability": 6.5,
                    "semantic_richness": 7.0,
                    "structural_clarity": 8.0
                }},
                "optimization_potential": {{
                    "ai_answerability": 9.0,
                    "semantic_richness": 8.5,
                    "structural_clarity": 9.5
                }}
            }},
            "action_plan": [
                {{
                    "priority": "high",
                    "action": "Add FAQ section",
                    "rationale": "Improves direct question answering"
                }}
            ]
        }}
        ```
        """
| def optimize_content_with_rag(self, content: str, optimization_type: str = "geo_standard", | |
| analyze_only: bool = False) -> Dict[str, Any]: | |
| """ | |
| Main RAG-enhanced content optimization for GEO | |
| Args: | |
| content (str): Content to optimize | |
| optimization_type (str): Type of GEO optimization | |
| analyze_only (bool): Whether to only analyze without rewriting | |
| Returns: | |
| Dict: Comprehensive GEO optimization results | |
| """ | |
| try: | |
| # Create knowledge base documents | |
| knowledge_docs = [Document(page_content=knowledge, metadata={"source": "geo_best_practices"}) | |
| for knowledge in self.geo_knowledge] | |
| if self.vector_chunker: | |
| # Use RAG to get relevant knowledge | |
| qa_chain = self.vector_chunker.create_qa_chain(knowledge_docs, self.llm) | |
| # Query for relevant GEO practices | |
| geo_query = f"How to optimize this type of content for AI search engines: {content[:500]}" | |
| context_result = qa_chain({"query": geo_query}) | |
| context = context_result.get("result", "") | |
| else: | |
| # Fallback to using all knowledge if vector_chunker not available | |
| context = "\n\n".join(self.geo_knowledge) | |
| # Choose optimization approach | |
| if optimization_type == "competitive_geo": | |
| return self._competitive_geo_optimization(content, context) | |
| else: | |
| return self._standard_geo_optimization(content, context, analyze_only) | |
| except Exception as e: | |
| return {'error': f"RAG-enhanced optimization failed: {str(e)}"} | |
| def _standard_geo_optimization(self, content: str, context: str, analyze_only: bool) -> Dict[str, Any]: | |
| """Standard GEO optimization with RAG context""" | |
| try: | |
| prompt_template = ChatPromptTemplate.from_messages([ | |
| SystemMessagePromptTemplate.from_template(self.rag_enhancement_prompt), | |
| HumanMessagePromptTemplate.from_template("Optimize this content using GEO best practices.") | |
| ]) | |
| chain = prompt_template | self.llm | |
| result = chain.invoke({ | |
| "context": context, | |
| "content": content[:5000] # Limit content length | |
| }) | |
| result_content = result.content if hasattr(result, 'content') else str(result) | |
| parsed_result = self._parse_optimization_result(result_content) | |
| # Add metadata | |
| parsed_result.update({ | |
| 'optimization_type': 'geo_standard', | |
| 'rag_enhanced': True, | |
| 'analyze_only': analyze_only, | |
| 'original_length': len(content), | |
| 'knowledge_sources': len(self.geo_knowledge) | |
| }) | |
| return parsed_result | |
| except Exception as e: | |
| return {'error': f"Standard GEO optimization failed: {str(e)}"} | |
| def _competitive_geo_optimization(self, content: str, context: str) -> Dict[str, Any]: | |
| """Competitive GEO analysis with RAG context""" | |
| try: | |
| prompt_template = ChatPromptTemplate.from_messages([ | |
| SystemMessagePromptTemplate.from_template(self.competitive_geo_prompt), | |
| HumanMessagePromptTemplate.from_template("Perform competitive GEO analysis.") | |
| ]) | |
| chain = prompt_template | self.llm | |
| result = chain.invoke({ | |
| "context": context, | |
| "content": content[:5000] | |
| }) | |
| result_content = result.content if hasattr(result, 'content') else str(result) | |
| parsed_result = self._parse_optimization_result(result_content) | |
| parsed_result.update({ | |
| 'optimization_type': 'competitive_geo', | |
| 'rag_enhanced': True, | |
| 'competitive_analysis': True | |
| }) | |
| return parsed_result | |
| except Exception as e: | |
| return {'error': f"Competitive GEO optimization failed: {str(e)}"} | |
| def batch_optimize_with_rag(self, content_list: List[str], optimization_type: str = "geo_standard") -> List[Dict[str, Any]]: | |
| """ | |
| Batch optimize multiple content pieces with RAG | |
| Args: | |
| content_list: List of content to optimize | |
| optimization_type: Type of optimization | |
| Returns: | |
| List of optimization results | |
| """ | |
| results = [] | |
| for i, content in enumerate(content_list): | |
| try: | |
| result = self.optimize_content_with_rag( | |
| content, | |
| optimization_type=optimization_type | |
| ) | |
| result['batch_index'] = i | |
| results.append(result) | |
| except Exception as e: | |
| results.append({ | |
| 'batch_index': i, | |
| 'error': f"Batch GEO optimization failed: {str(e)}" | |
| }) | |
| return results | |
| def analyze_geo_readability(self, content: str) -> Dict[str, Any]: | |
| """ | |
| Analyze content readability specifically for GEO/AI systems | |
| """ | |
| try: | |
| # Basic metrics | |
| words = content.split() | |
| sentences = re.split(r'[.!?]+', content) | |
| sentences = [s.strip() for s in sentences if s.strip()] | |
| paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()] | |
| # GEO-specific analysis | |
| questions = len(re.findall(r'\?', content)) | |
| headings = len(re.findall(r'^#+\s', content, re.MULTILINE)) | |
| lists = len(re.findall(r'^\s*[-*+]\s', content, re.MULTILINE)) | |
| numbers = len(re.findall(r'\b\d+\.?\d*\b', content)) | |
| # Entity-like patterns (proper nouns) | |
| entities = len(re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', content)) | |
| # Calculate GEO readability score | |
| geo_score = self._calculate_geo_readability_score({ | |
| 'avg_words_per_sentence': len(words) / len(sentences) if sentences else 0, | |
| 'questions_ratio': questions / len(sentences) if sentences else 0, | |
| 'structure_elements': headings + lists, | |
| 'entity_density': entities / len(words) if words else 0, | |
| 'numeric_data': numbers / len(words) if words else 0 | |
| }) | |
| return { | |
| 'geo_readability_metrics': { | |
| 'total_words': len(words), | |
| 'total_sentences': len(sentences), | |
| 'total_paragraphs': len(paragraphs), | |
| 'questions_count': questions, | |
| 'headings_count': headings, | |
| 'lists_count': lists, | |
| 'entity_mentions': entities, | |
| 'numeric_data_points': numbers | |
| }, | |
| 'geo_readability_score': geo_score, | |
| 'ai_optimization_indicators': { | |
| 'question_ratio': questions / len(sentences) if sentences else 0, | |
| 'structure_score': min(10, (headings + lists) * 2), | |
| 'entity_density': entities / len(words) if words else 0, | |
| 'data_richness': numbers / len(words) if words else 0 | |
| }, | |
| 'geo_recommendations': self._generate_geo_recommendations({ | |
| 'questions': questions, | |
| 'headings': headings, | |
| 'lists': lists, | |
| 'entities': entities, | |
| 'sentences': len(sentences) | |
| }) | |
| } | |
| except Exception as e: | |
| return {'error': f"GEO readability analysis failed: {str(e)}"} | |
| def extract_geo_entities(self, content: str) -> Dict[str, Any]: | |
| """ | |
| Extract entities and concepts relevant for GEO optimization | |
| """ | |
| try: | |
| if not self.vector_chunker: | |
| return {'error': 'Vector chunker not available for entity extraction'} | |
| # Create knowledge context about entity extraction | |
| entity_knowledge = [Document( | |
| page_content=""" | |
| For GEO optimization, important entities include: | |
| 1. Named entities: People, organizations, locations, brands | |
| 2. Technical concepts: Industry terms, methodologies, tools | |
| 3. Topical entities: Core subjects, themes, categories | |
| 4. Relational entities: Connected concepts, dependencies | |
| 5. Question entities: What users commonly ask about | |
| """, | |
| metadata={"source": "entity_extraction_guide"} | |
| )] | |
| qa_chain = self.vector_chunker.create_qa_chain(entity_knowledge, self.llm) | |
| # Extract different types of entities | |
| extraction_queries = [ | |
| "What are the main named entities (people, places, organizations) in this content?", | |
| "What are the key technical concepts and terms?", | |
| "What questions might users have about this content?", | |
| "What related topics and concepts are mentioned?" | |
| ] | |
| extracted_data = {} | |
| for query in extraction_queries: | |
| full_query = f"{query}\n\nContent: {content[:3000]}" | |
| result = qa_chain({"query": full_query}) | |
| query_key = query.split('?')[0].lower().replace(' ', '_').replace('what_are_the_', '') | |
| extracted_data[query_key] = result.get("result", "") | |
| return { | |
| 'geo_entities': extracted_data, | |
| 'extraction_method': 'rag_enhanced', | |
| 'content_length': len(content), | |
| 'extraction_success': True | |
| } | |
| except Exception as e: | |
| return {'error': f"GEO entity extraction failed: {str(e)}"} | |
| def generate_geo_variations(self, content: str, num_variations: int = 3) -> List[Dict[str, Any]]: | |
| """ | |
| Generate GEO-optimized content variations using RAG | |
| """ | |
| variations = [] | |
| variation_types = [ | |
| ("faq_focused", "Transform into FAQ format optimized for AI Q&A systems"), | |
| ("conversational", "Optimize for conversational AI and voice search"), | |
| ("authoritative", "Enhance with authoritative tone for citation systems") | |
| ] | |
| try: | |
| # Get GEO context | |
| knowledge_docs = [Document(page_content=knowledge, metadata={"source": "geo_practices"}) | |
| for knowledge in self.geo_knowledge] | |
| if self.vector_chunker: | |
| qa_chain = self.vector_chunker.create_qa_chain(knowledge_docs, self.llm) | |
| for i, (variation_type, description) in enumerate(variation_types[:num_variations]): | |
| try: | |
| # Get specific guidance for this variation type | |
| context_query = f"How to optimize content for {variation_type} in AI systems?" | |
| context_result = qa_chain({"query": context_query}) | |
| context = context_result.get("result", "") | |
| variation_prompt = f""" | |
| Create a {variation_type} version of the content optimized for GEO. | |
| Context: {context} | |
| Original Content: {content[:4000]} | |
| Variation Goal: {description} | |
| Return JSON: | |
| {{ | |
| "variation_type": "{variation_type}", | |
| "optimized_content": "the rewritten content...", | |
| "geo_improvements": ["improvement 1", "improvement 2"], | |
| "target_ai_systems": ["ChatGPT", "Claude", "etc"], | |
| "expected_geo_benefits": ["benefit 1", "benefit 2"] | |
| }} | |
| """ | |
| prompt_template = ChatPromptTemplate.from_messages([ | |
| SystemMessagePromptTemplate.from_template(variation_prompt), | |
| HumanMessagePromptTemplate.from_template("Generate the GEO-optimized variation.") | |
| ]) | |
| chain = prompt_template | self.llm | |
| result = chain.invoke({}) | |
| result_content = result.content if hasattr(result, 'content') else str(result) | |
| parsed_result = self._parse_optimization_result(result_content) | |
| parsed_result.update({ | |
| 'variation_index': i, | |
| 'rag_enhanced': True, | |
| 'geo_optimized': True | |
| }) | |
| variations.append(parsed_result) | |
| except Exception as e: | |
| variations.append({ | |
| 'variation_index': i, | |
| 'variation_type': variation_type, | |
| 'error': f"GEO variation generation failed: {str(e)}" | |
| }) | |
| else: | |
| return [{'error': 'Vector chunker not available for variation generation'}] | |
| except Exception as e: | |
| return [{'error': f"GEO variation generation failed: {str(e)}"}] | |
| return variations | |
| def _calculate_geo_readability_score(self, metrics: Dict[str, float]) -> float: | |
| """Calculate GEO-specific readability score""" | |
| try: | |
| # GEO-optimized scoring | |
| sentence_score = max(0, 10 - abs(metrics['avg_words_per_sentence'] - 15) * 0.3) | |
| question_score = min(10, metrics['questions_ratio'] * 50) # Reward questions | |
| structure_score = min(10, metrics['structure_elements'] * 1.5) # Reward headings/lists | |
| entity_score = min(10, metrics['entity_density'] * 100) # Reward entities | |
| data_score = min(10, metrics['numeric_data'] * 200) # Reward data points | |
| # Weighted for GEO priorities | |
| overall_score = ( | |
| sentence_score * 0.2 + | |
| question_score * 0.25 + | |
| structure_score * 0.25 + | |
| entity_score * 0.15 + | |
| data_score * 0.15 | |
| ) | |
| return round(overall_score, 1) | |
| except Exception: | |
| return 5.0 | |
| def _generate_geo_recommendations(self, metrics: Dict[str, int]) -> List[str]: | |
| """Generate GEO-specific recommendations""" | |
| recommendations = [] | |
| try: | |
| if metrics['questions'] == 0: | |
| recommendations.append("Add FAQ section or question-based headings for better AI Q&A performance") | |
| if metrics['headings'] < 2: | |
| recommendations.append("Add more structured headings to improve AI content parsing") | |
| if metrics['lists'] == 0: | |
| recommendations.append("Include bullet points or numbered lists for better information extraction") | |
| if metrics['entities'] < 5: | |
| recommendations.append("Include more specific entities (names, places, organizations) for authority") | |
| if metrics['questions'] / metrics['sentences'] < 0.1: | |
| recommendations.append("Consider transforming statements into question-answer pairs") | |
| return recommendations | |
| except Exception: | |
| return ["Unable to generate specific GEO recommendations"] | |
| def _parse_optimization_result(self, response_text: str) -> Dict[str, Any]: | |
| """Parse LLM response and extract structured results""" | |
| try: | |
| # Find JSON content in the response | |
| json_start = response_text.find('{') | |
| json_end = response_text.rfind('}') + 1 | |
| if json_start != -1 and json_end != -1: | |
| json_str = response_text[json_start:json_end] | |
| parsed = json.loads(json_str) | |
| return parsed | |
| else: | |
| # If no JSON found, return structured error | |
| return { | |
| 'raw_response': response_text, | |
| 'parsing_error': 'No JSON structure found in response', | |
| 'geo_analysis': { | |
| 'current_geo_score': 0, | |
| 'ai_search_visibility': 0, | |
| 'query_intent_matching': 0, | |
| 'conversational_readiness': 0, | |
| 'citation_worthiness': 0, | |
| 'context_completeness': 0 | |
| } | |
| } | |
| except json.JSONDecodeError as e: | |
| return { | |
| 'raw_response': response_text, | |
| 'parsing_error': f'JSON decode error: {str(e)}', | |
| 'geo_analysis': { | |
| 'current_geo_score': 0, | |
| 'ai_search_visibility': 0, | |
| 'query_intent_matching': 0, | |
| 'conversational_readiness': 0, | |
| 'citation_worthiness': 0, | |
| 'context_completeness': 0 | |
| } | |
| } | |
| except Exception as e: | |
| return { | |
| 'raw_response': response_text, | |
| 'parsing_error': f'Unexpected parsing error: {str(e)}', | |
| 'geo_analysis': { | |
| 'current_geo_score': 0, | |
| 'ai_search_visibility': 0, | |
| 'query_intent_matching': 0, | |
| 'conversational_readiness': 0, | |
| 'citation_worthiness': 0, | |
| 'context_completeness': 0 | |
| } | |
| } | |
| # Legacy methods for backward compatibility | |
| def optimize_content(self, content: str, analyze_only: bool = False, | |
| include_keywords: bool = True, optimization_type: str = "standard") -> Dict[str, Any]: | |
| """ | |
| Legacy method - redirects to RAG-enhanced optimization | |
| """ | |
| if optimization_type == "standard": | |
| return self.optimize_content_with_rag(content, "geo_standard", analyze_only) | |
| elif optimization_type == "seo": | |
| return self.optimize_content_with_rag(content, "geo_standard", analyze_only) | |
| elif optimization_type == "competitive": | |
| return self.optimize_content_with_rag(content, "competitive_geo", analyze_only) | |
| else: | |
| return self.optimize_content_with_rag(content, "geo_standard", analyze_only) | |
    def analyze_content_readability(self, content: str) -> Dict[str, Any]:
        """Legacy method - redirects to GEO readability analysis"""
        # Thin delegate kept so pre-RAG callers keep working.
        return self.analyze_geo_readability(content)