Files changed (8)
  1. app.py +104 -976
  2. demo.json +0 -58
  3. requirements.txt +0 -6
  4. utils/chunker.py +0 -1314
  5. utils/export.py +0 -1896
  6. utils/optimizer.py +0 -354
  7. utils/parser.py +0 -549
  8. utils/scorer.py +0 -484
app.py CHANGED
@@ -1,983 +1,111 @@
- """
- Main Streamlit Application - GEO SEO AI Optimizer with RAG-Enhanced Content Optimization
- Entry point for the application with UI components
- """
-
- import streamlit as st
  import os
  import tempfile
- import json
- from typing import Dict, Any, List
- import time
-
- # Import our custom modules
- from utils.parser import PDFParser, TextParser, WebpageParser
- from utils.scorer import GEOScorer
- from utils.optimizer import ContentOptimizer  # This will be your enhanced version
- from utils.chunker import VectorChunker
- from utils.export import ResultExporter

- # Import LangChain components
- from langchain_groq import ChatGroq
  from langchain_community.embeddings import HuggingFaceEmbeddings
- from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
- from langchain_core.messages import AIMessage, HumanMessage
-
- class GEOSEOApp:
-     """Main application class that orchestrates all components"""
-
-     def __init__(self):
-         self.setup_config()
-         self.setup_models()
-         self.setup_parsers()
-         self.setup_components()
-
-     def setup_config(self):
-         """Initialize configuration and API keys"""
-         self.groq_api_key = os.getenv("GROQ_API_KEY", "your-groq-api-key")
-         self.hf_api_key = os.getenv("HUGGINGFACE_API_KEY", "your-huggingface-api-key")
-
-         # Create data directory if it doesn't exist
-         os.makedirs("data/uploaded_files", exist_ok=True)
-
-     def setup_models(self):
-         """Initialize LLM and embedding models"""
-         self.llm = ChatGroq(
-             api_key=self.groq_api_key,
-             model_name="llama-3.1-8b-instant",
-             temperature=0.1
-         )
-
-         self.embeddings = HuggingFaceEmbeddings(
-             model_name="sentence-transformers/all-MiniLM-L6-v2",
-             model_kwargs={"device": "cpu"}
-             # model_name="sentence-transformers/all-MiniLM-L6-v2",
-             # model_kwargs={"device": "cpu"},
-             # cache_folder="./hf_caches",
-         )
-
-     def setup_parsers(self):
-         """Initialize content parsers"""
-         self.pdf_parser = PDFParser()
-         self.text_parser = TextParser()
-         self.webpage_parser = WebpageParser()
-
-     def setup_components(self):
-         """Initialize processing components with RAG integration"""
-         self.geo_scorer = GEOScorer(self.llm)
-         self.vector_chunker = VectorChunker(self.embeddings)
-
-         # Enhanced content optimizer with RAG capabilities
-         self.content_optimizer = ContentOptimizer(self.llm, self.vector_chunker)
-
-         self.result_exporter = ResultExporter()
-
-     def run(self):
-         """Main application runner"""
-         st.set_page_config(
-             page_title="GEO SEO AI Optimizer",
-             page_icon="🚀",
-             layout="wide"
-         )
-
-         st.title("🚀 GEO SEO AI Optimizer")
-         st.markdown("*Optimize your content for AI search engines and LLM systems with RAG-enhanced analysis*")
-
-         # Sidebar
-         self.render_sidebar()
-
-         # Main tabs
-         tab1, tab2, tab3, tab4 = st.tabs([
-             "🌐 Website GEO Analysis",
-             "🔧 GEO Content Enhancement",
-             "📄 Document Q&A",
-             "🧠 Generate GEO Content",
-         ])
-
-         with tab1:
-             self.render_website_analysis_tab()
-
-         with tab2:
-             self.render_geo_content_enhancement_tab()
-
-         with tab3:
-             self.render_document_qa_tab()
-         with tab4:
-             self.render_generate_geo_content_tab()
-
-
-     def render_sidebar(self):
-         """Render sidebar with information and controls"""
-         st.sidebar.title("🛠️ GEO Tools")
-         st.sidebar.markdown("- 🌐 Website GEO Analysis")
-         st.sidebar.markdown("- 🔧 RAG-Enhanced Content Optimization")
-         st.sidebar.markdown("- 📊 AI-First SEO Scoring")
-         st.sidebar.markdown("- 📄 Document Q&A with RAG")
-         st.sidebar.markdown("- 🧠 Generate GEO Content")
-
-         st.sidebar.markdown("---")
-         st.sidebar.markdown("### 📖 GEO Metrics")
-         st.sidebar.markdown("**AI Search Visibility**: How likely AI engines will surface your content")
-         st.sidebar.markdown("**Query Intent Matching**: How well content matches user queries")
-         st.sidebar.markdown("**Conversational Readiness**: Suitability for AI chat responses")
-         st.sidebar.markdown("**Citation Worthiness**: Probability of being cited by AI")
-         st.sidebar.markdown("**Context Completeness**: How self-contained the content is")
-         st.sidebar.markdown("**Semantic Richness**: Depth of topic coverage")
-
-         st.sidebar.markdown("---")
-         st.sidebar.markdown("### 🧠 RAG Enhancement")
-         st.sidebar.markdown("- **Knowledge Base**: GEO best practices")
-         st.sidebar.markdown("- **Contextual Analysis**: AI-informed optimization")
-         st.sidebar.markdown("- **Entity Extraction**: AI-powered entity recognition")
-         st.sidebar.markdown("- **Competitive Analysis**: Gap identification")
-
-     def render_geo_content_enhancement_tab(self):
-         """Render GEO Content Enhancement tab with RAG integration"""
-         st.header("🔧 GEO Content Enhancement with RAG")
-         st.markdown("Analyze and optimize your content using AI-powered Generative Engine Optimization with RAG-enhanced knowledge base.")
-
-         # Content input
-         input_text = st.text_area(
-             "Enter content to analyze and enhance:",
-             height=200,
-             key="geo_enhancement_input",
-             help="Paste your content here for GEO optimization using RAG-enhanced analysis"
-         )
-
-         # GEO Optimization type selector
-         st.markdown("### ⚙️ GEO Optimization Settings")
-         col1, col2 = st.columns(2)
-
-         with col1:
-             optimization_type = st.selectbox(
-                 "Select GEO Optimization Type:",
-                 options=[
-                     "geo_standard",
-                     # "competitive_geo",
-                     # "geo_readability",
-                     # "geo_entity_extraction",
-                     # "geo_variations",
-                     # "geo_batch_optimize"
-                 ],
-                 format_func=lambda x: {
-                     "geo_standard": "🔧 Standard GEO Enhancement",
-                     # "competitive_geo": "📊 Competitive GEO Analysis",
-                     # "geo_readability": "📖 GEO Readability Analysis",
-                     # "geo_entity_extraction": "🏷️ GEO Entity Extraction",
-                     # "geo_variations": "🔄 GEO Content Variations",
-                     # "geo_batch_optimize": "📦 Batch GEO Optimization"
-                 }[x],
-                 index=0,
-                 help="Choose the type of GEO optimization powered by RAG analysis"
-             )
-
-         with col2:
-             # Additional options based on optimization type
-             if optimization_type in ["geo_standard", "competitive_geo"]:
-                 analyze_only = st.checkbox("Analysis", value=True)
-                 include_rag_context = st.checkbox("Include RAG context details", value=True)
-             # elif optimization_type == "geo_variations":
-             #     num_variations = st.slider("Number of variations", min_value=1, max_value=3, value=2)
-             #     analyze_only = False
-             #     include_rag_context = True
-             # elif optimization_type == "geo_batch_optimize":
-             #     st.info("For batch optimization, separate multiple content pieces with '---' divider")
-             #     analyze_only = False
-             #     include_rag_context = True
-             else:
-                 analyze_only = False
-                 include_rag_context = True
-
-         # Show description based on optimization type
-         optimization_descriptions = {
-             "geo_standard": "🔧 RAG-enhanced GEO optimization focusing on AI search visibility, conversational readiness, and citation worthiness using knowledge base guidance.",
-             # "competitive_geo": "📊 Competitive GEO analysis against best practices with gap identification and actionable recommendations using RAG context.",
-             # "geo_readability": "📖 Detailed readability analysis specifically optimized for AI systems and LLM consumption patterns.",
-             # "geo_entity_extraction": "🏷️ AI-powered extraction of key entities, topics, and concepts relevant for GEO optimization.",
-             # "geo_variations": "🔄 Generate multiple GEO-optimized variations (FAQ, conversational, authoritative) using RAG knowledge.",
-             # "geo_batch_optimize": "📦 Process multiple content pieces simultaneously with consistent GEO optimization."
-         }
-
-         st.info(f"**{optimization_descriptions[optimization_type]}**")
-
-         # Knowledge base status
-         if hasattr(self.content_optimizer, 'geo_knowledge'):
-             st.success(f"✅ RAG Knowledge Base Loaded: {len(self.content_optimizer.geo_knowledge)} GEO best practice documents")
-         else:
-             st.warning("⚠️ RAG Knowledge Base not available - falling back to standard optimization")
-
-         # Submit button
-         if st.button("🚀 Process Content with GEO+RAG", key="geo_enhancement_submit"):
-             if not input_text.strip():
-                 st.warning("Please enter some content to analyze.")
-                 return
-
-             try:
-                 with st.spinner(f"Processing content with {optimization_type} using RAG-enhanced GEO analysis..."):
-                     # Handle different GEO optimization types
-                     if optimization_type == "geo_standard":
-                         result = self.content_optimizer.optimize_content_with_rag(
-                             input_text,
-                             optimization_type="geo_standard",
-                             analyze_only=analyze_only
-                         )
-
-                     elif optimization_type == "competitive_geo":
-                         result = self.content_optimizer.optimize_content_with_rag(
-                             input_text,
-                             optimization_type="competitive_geo",
-                             analyze_only=analyze_only
-                         )
-
-                     elif optimization_type == "geo_readability":
-                         result = self.content_optimizer.analyze_geo_readability(input_text)
-
-                     elif optimization_type == "geo_entity_extraction":
-                         result = self.content_optimizer.extract_geo_entities(input_text)
-
-                     elif optimization_type == "geo_variations":
-                         result = self.content_optimizer.generate_geo_variations(
-                             input_text,
-                             num_variations=num_variations
-                         )
-
-                     elif optimization_type == "geo_batch_optimize":
-                         # Split content by '---' separator
-                         content_pieces = [piece.strip() for piece in input_text.split('---') if piece.strip()]
-                         if len(content_pieces) > 1:
-                             result = self.content_optimizer.batch_optimize_with_rag(content_pieces)
-                         else:
-                             st.warning("For batch optimization, please separate content pieces with '---'")
-                             return
-
-                 if isinstance(result, list):
-                     # Handle list results (variations, batch)
-                     if any(r.get("error") for r in result):
-                         failed_results = [r for r in result if r.get("error")]
-                         st.error(f"Some processing failed: {len(failed_results)} out of {len(result)} items")
-                     else:
-                         st.success("All content processed successfully!")
-                 elif result.get("error"):
-                     st.error(f"Processing failed: {result['error']}")
-                     return
-                 else:
-                     st.success(f"{optimization_type.replace('_', ' ').title()} completed successfully!")
-
-                 # Display results based on optimization type
-                 self.display_geo_enhancement_results(result, optimization_type, input_text, include_rag_context)
-
-             except Exception as e:
-                 st.error(f"An error occurred: {str(e)}")
-
-     def display_geo_enhancement_results(self, result, optimization_type, original_text, include_rag_context=True):
-         """Display results based on GEO optimization type"""
-
-         if optimization_type == "geo_batch_optimize":
-             self.display_geo_batch_results(result)
-         elif optimization_type == "geo_variations":
-             self.display_geo_variation_results(result)
-         elif optimization_type == "geo_readability":
-             self.display_geo_readability_results(result)
-         elif optimization_type == "geo_entity_extraction":
-             self.display_geo_entity_results(result)
-         else:
-             self.display_standard_geo_results(result, optimization_type, include_rag_context)
-
-         # Export functionality
-         self.display_geo_export_options(result, optimization_type, original_text)
-
-     def display_standard_geo_results(self, result, optimization_type, include_rag_context):
-         """Display results for standard and competitive GEO optimizations"""
-         st.markdown("### 📊 GEO Analysis Results")
-
-         # Show GEO scores if available
-         geo_analysis = result.get("geo_analysis", {})
-         if geo_analysis:
-             st.markdown("#### 🎯 GEO Performance Metrics")
-
-             col1, col2, col3 = st.columns(3)
-             with col1:
-                 current_score = geo_analysis.get("current_geo_score", 0)
-                 st.metric("Overall GEO Score", f"{current_score}/10")
-
-             with col2:
-                 ai_visibility = geo_analysis.get("ai_search_visibility", 0)
-                 st.metric("AI Search Visibility", f"{ai_visibility}/10")
-
-             with col3:
-                 citation_worthy = geo_analysis.get("citation_worthiness", 0)
-                 st.metric("Citation Worthiness", f"{citation_worthy}/10")
-
-             # Second row of metrics
-             col1, col2, col3 = st.columns(3)
-             with col1:
-                 query_matching = geo_analysis.get("query_intent_matching", 0)
-                 st.metric("Query Intent Match", f"{query_matching}/10")
-
-             with col2:
-                 conversational = geo_analysis.get("conversational_readiness", 0)
-                 st.metric("Conversational Ready", f"{conversational}/10")
-
-             with col3:
-                 context_complete = geo_analysis.get("context_completeness", 0)
-                 st.metric("Context Complete", f"{context_complete}/10")
-
-         # Show optimization opportunities
-         opportunities = result.get("optimization_opportunities", [])
-         if opportunities:
-             st.markdown("#### 🚀 Optimization Opportunities")
-
-             high_priority = [opp for opp in opportunities if opp.get('priority') == 'high']
-             medium_priority = [opp for opp in opportunities if opp.get('priority') == 'medium']
-
-             if high_priority:
-                 st.markdown("##### 🔴 High Priority")
-                 for opp in high_priority:
-                     st.write(f"**{opp.get('type', 'Optimization')}**: {opp.get('description', '')}")
-                     if opp.get('expected_impact'):
-                         st.write(f"*Expected Impact: {opp.get('expected_impact')}*")
-                     st.write("---")
-
-             if medium_priority:
-                 st.markdown("##### 🟡 Medium Priority")
-                 for opp in medium_priority:
-                     st.write(f"**{opp.get('type', 'Optimization')}**: {opp.get('description', '')}")
-                     if opp.get('expected_impact'):
-                         st.write(f"*Expected Impact: {opp.get('expected_impact')}*")
-                     st.write("---")
-
-         # Show GEO keywords and entities
-         geo_keywords = result.get("geo_keywords", {})
-         if geo_keywords:
-             st.markdown("#### 🔑 GEO Keywords & Entities")
-
-             col1, col2 = st.columns(2)
-             with col1:
-                 primary_entities = geo_keywords.get("primary_entities", [])
-                 if primary_entities:
-                     st.write("**Primary Entities:**")
-                     st.write(", ".join(primary_entities))
-
-                 semantic_terms = geo_keywords.get("semantic_terms", [])
-                 if semantic_terms:
-                     st.write("**Semantic Terms:**")
-                     st.write(", ".join(semantic_terms))
-
-             with col2:
-                 question_patterns = geo_keywords.get("question_patterns", [])
-                 if question_patterns:
-                     st.write("**Question Patterns:**")
-                     for q in question_patterns:
-                         st.write(f"• {q}")
-
-                 related_concepts = geo_keywords.get("related_concepts", [])
-                 if related_concepts:
-                     st.write("**Related Concepts:**")
-                     st.write(", ".join(related_concepts))
-
-         # Show optimized content
-         optimized_content = result.get("optimized_content", {})
-         if optimized_content:
-             enhanced_text = optimized_content.get("enhanced_text", "")
-             if enhanced_text:
-                 st.markdown("#### ✨ GEO-Optimized Content")
-                 st.text_area(
-                     "Enhanced version:",
-                     value=enhanced_text,
-                     height=250,
-                     key="geo_optimized_output"
-                 )
-
-                 # Show structural improvements
-                 structural_improvements = optimized_content.get("structural_improvements", [])
-                 if structural_improvements:
-                     st.markdown("**Structural Improvements:**")
-                     for improvement in structural_improvements:
-                         st.write(f"• {improvement}")
-
-                 # Show semantic enhancements
-                 semantic_enhancements = optimized_content.get("semantic_enhancements", [])
-                 if semantic_enhancements:
-                     st.markdown("**Semantic Enhancements:**")
-                     for enhancement in semantic_enhancements:
-                         st.write(f"• {enhancement}")
-
-         # Show competitive analysis if available
-         if "competitive_gaps" in result:
-             st.markdown("#### 📊 Competitive GEO Analysis")
-             competitive_gaps = result["competitive_gaps"]
-
-             col1, col2 = st.columns(2)
-             with col1:
-                 missing_questions = competitive_gaps.get("missing_question_patterns", [])
-                 if missing_questions:
-                     st.write("**Missing Question Patterns:**")
-                     for q in missing_questions:
-                         st.write(f"• {q}")
-
-                 entity_gaps = competitive_gaps.get("entity_gaps", [])
-                 if entity_gaps:
-                     st.write("**Entity Gaps:**")
-                     st.write(", ".join(entity_gaps))
-
-             with col2:
-                 semantic_opportunities = competitive_gaps.get("semantic_opportunities", [])
-                 if semantic_opportunities:
-                     st.write("**Semantic Opportunities:**")
-                     st.write(", ".join(semantic_opportunities))
-
-                 structural_weaknesses = competitive_gaps.get("structural_weaknesses", [])
-                 if structural_weaknesses:
-                     st.write("**Structural Weaknesses:**")
-                     for weakness in structural_weaknesses:
-                         st.write(f"• {weakness}")
-
-         # Show recommendations
-         recommendations = result.get("recommendations", [])
-         if recommendations:
-             st.markdown("#### 💡 GEO Recommendations")
-             for i, rec in enumerate(recommendations, 1):
-                 st.write(f"**{i}.** {rec}")
-
-         # RAG context information
-         if include_rag_context and result.get("rag_enhanced"):
-             with st.expander("🧠 RAG Enhancement Details"):
-                 st.write("**RAG Status:** ✅ Knowledge base successfully applied")
-                 st.write(f"**Knowledge Sources:** {result.get('knowledge_sources', 'Multiple')} GEO best practice documents")
-                 st.write(f"**Enhancement Type:** {result.get('optimization_type', 'Standard')}")
-
-                 if result.get('parsing_error'):
-                     st.warning(f"**Parsing Note:** {result['parsing_error']}")
-
-     def display_geo_batch_results(self, results):
-         """Display batch GEO optimization results"""
-         st.markdown("### 📦 Batch GEO Processing Results")
-
-         successful_results = [r for r in results if not r.get('error')]
-         failed_results = [r for r in results if r.get('error')]
-
-         col1, col2, col3 = st.columns(3)
-         with col1:
-             st.metric("Total Pieces", len(results))
-         with col2:
-             st.metric("Successful", len(successful_results))
-         with col3:
-             st.metric("Failed", len(failed_results))
-
-         # Show individual results
-         for result in results:
-             idx = result.get('batch_index', 0)
-             st.markdown(f"#### Content Piece {idx + 1}")
-
-             if result.get('error'):
-                 st.error(f"Processing failed: {result['error']}")
-             else:
-                 # Show GEO scores
-                 geo_analysis = result.get("geo_analysis", {})
-                 if geo_analysis:
-                     col1, col2, col3 = st.columns(3)
-                     with col1:
-                         st.metric("GEO Score", f"{geo_analysis.get('current_geo_score', 0):.1f}")
-                     with col2:
-                         st.metric("AI Visibility", f"{geo_analysis.get('ai_search_visibility', 0):.1f}")
-                     with col3:
-                         st.metric("Citation Worthy", f"{geo_analysis.get('citation_worthiness', 0):.1f}")
-
-                 # Show optimized content if available
-                 optimized_content = result.get("optimized_content", {})
-                 enhanced_text = optimized_content.get("enhanced_text", "")
-                 if enhanced_text:
-                     with st.expander("View GEO-optimized content"):
-                         st.text_area("", value=enhanced_text[:500] + "...", height=150, key=f"batch_geo_output_{idx}")
-
-             st.write("---")
-
-     def display_geo_variation_results(self, variations):
-         """Display GEO content variation results"""
-         st.markdown("### 🔄 GEO Content Variations")
-
-         for i, variation in enumerate(variations):
-             if variation.get('error'):
-                 st.error(f"Variation {i+1} failed: {variation['error']}")
-                 continue
-
-             variation_type = variation.get('variation_type', f'Variation {i+1}')
-             st.markdown(f"#### {variation_type.replace('_', ' ').title()} Version")
-
-             # Show GEO improvements
-             geo_improvements = variation.get('geo_improvements', [])
-             if geo_improvements:
-                 st.write("**GEO Improvements:**")
-                 for improvement in geo_improvements:
-                     st.write(f"• {improvement}")
-
-             # Show target AI systems
-             target_ai_systems = variation.get('target_ai_systems', [])
-             if target_ai_systems:
-                 st.write(f"**Optimized For:** {', '.join(target_ai_systems)}")
-
-             # Show expected benefits
-             expected_benefits = variation.get('expected_geo_benefits', [])
-             if expected_benefits:
-                 st.write("**Expected GEO Benefits:**")
-                 for benefit in expected_benefits:
-                     st.write(f"• {benefit}")
-
-             # Show optimized content
-             optimized_content = variation.get('optimized_content', '')
-             if optimized_content:
-                 st.text_area(
-                     f"{variation_type} content:",
-                     value=optimized_content,
-                     height=200,
-                     key=f"geo_variation_{i}"
-                 )
-
-             st.write("---")
-
-     def display_geo_readability_results(self, result):
-         """Display GEO readability analysis results"""
-         st.markdown("### 📖 GEO Readability Analysis")
-
-         # Basic GEO metrics
-         geo_metrics = result.get('geo_readability_metrics', {})
-         if geo_metrics:
-             st.markdown("#### 📊 GEO Content Metrics")
-             col1, col2, col3, col4 = st.columns(4)
-
-             with col1:
-                 st.metric("Total Words", geo_metrics.get('total_words', 0))
-             with col2:
-                 st.metric("Questions", geo_metrics.get('questions_count', 0))
-             with col3:
-                 st.metric("Headings", geo_metrics.get('headings_count', 0))
-             with col4:
-                 st.metric("Lists", geo_metrics.get('lists_count', 0))
-
-             # Second row
-             col1, col2, col3, col4 = st.columns(4)
-             with col1:
-                 st.metric("Entity Mentions", geo_metrics.get('entity_mentions', 0))
-             with col2:
-                 st.metric("Data Points", geo_metrics.get('numeric_data_points', 0))
-             with col3:
-                 st.metric("Paragraphs", geo_metrics.get('total_paragraphs', 0))
-             with col4:
-                 geo_score = result.get('geo_readability_score', 0)
-                 st.metric("GEO Readability", f"{geo_score}/10")
-
-         # AI optimization indicators
-         ai_indicators = result.get('ai_optimization_indicators', {})
-         if ai_indicators:
-             st.markdown("#### 🤖 AI Optimization Indicators")
-             col1, col2 = st.columns(2)
-
-             with col1:
-                 question_ratio = ai_indicators.get('question_ratio', 0)
-                 st.metric("Question Ratio", f"{question_ratio:.2%}")
-                 structure_score = ai_indicators.get('structure_score', 0)
-                 st.metric("Structure Score", f"{structure_score:.1f}/10")
-
-             with col2:
-                 entity_density = ai_indicators.get('entity_density', 0)
-                 st.metric("Entity Density", f"{entity_density:.2%}")
-                 data_richness = ai_indicators.get('data_richness', 0)
-                 st.metric("Data Richness", f"{data_richness:.2%}")
-
-         # GEO recommendations
-         geo_recommendations = result.get('geo_recommendations', [])
-         if geo_recommendations:
-             st.markdown("#### 💡 GEO Optimization Recommendations")
-             for i, rec in enumerate(geo_recommendations, 1):
-                 st.write(f"**{i}.** {rec}")
-
-     def display_geo_entity_results(self, result):
-         """Display GEO entity extraction results"""
-         st.markdown("### 🏷️ GEO Entity Analysis")
-
-         if result.get('error'):
-             st.error(f"Entity extraction failed: {result['error']}")
-             return
-
-         geo_entities = result.get('geo_entities', {})
-         if geo_entities:
-             # Display extracted entities
-             for entity_type, entity_data in geo_entities.items():
-                 if entity_data:
-                     st.markdown(f"#### {entity_type.replace('_', ' ').title()}")
-                     st.write(entity_data)
-                     st.write("---")
-
-         # Extraction metadata
-         extraction_success = result.get('extraction_success', False)
-         if extraction_success:
-             st.success("✅ Entity extraction completed successfully")
-             st.write(f"**Content Length:** {result.get('content_length', 0)} characters")
-             st.write(f"**Extraction Method:** {result.get('extraction_method', 'Unknown')}")
-
-     def display_geo_export_options(self, result, optimization_type, original_text):
-         """Display export options for GEO results"""
-         st.markdown("### 📥 Export GEO Results")
-
-         # Prepare export data
-         export_data = {
-             'timestamp': time.time(),
-             'optimization_type': optimization_type,
-             'original_text': original_text,
-             'original_word_count': len(original_text.split()),
-             'geo_results': result,
-             'rag_enhanced': result.get('rag_enhanced', False) if not isinstance(result, list) else any(r.get('rag_enhanced', False) for r in result),
-             'knowledge_sources': result.get('knowledge_sources', 0) if not isinstance(result, list) else 'multiple'
-         }
-
-         # Serialize data to JSON
-         export_json = json.dumps(export_data, indent=2, default=str)
-
-         # Add download button
-         st.download_button(
-             label="📥 Download GEO Analysis Report",
-             data=export_json,
-             file_name=f"geo_{optimization_type}_analysis_{int(time.time())}.json",
-             mime="application/json"
-         )

-     # Keep existing methods for other tabs (render_document_qa_tab, render_website_analysis_tab, etc.)
-     # ... (rest of the methods remain the same as in your original code)
-
-     def render_document_qa_tab(self):
-         """Render Document Q&A tab"""
-         st.header("📄 Document Question Answering")
-         st.markdown("Upload documents or paste text to ask questions using RAG.")
-
-         # File upload
-         uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
-
-         # Text input
-         pasted_text = st.text_area("Or paste text directly:", height=150)
-
-         # Question input
-         user_query = st.text_input("Ask a question about the content:")
-
-         # Submit button
-         if st.button("🔍 Ask Question", key="qa_submit"):
-             if not user_query.strip():
-                 st.warning("Please enter a question.")
-                 return
-
-             try:
-                 # Parse content
-                 documents = []
-
-                 if uploaded_file:
-                     with st.spinner("Processing PDF..."):
-                         # Save uploaded file temporarily
-                         temp_path = self.save_uploaded_file(uploaded_file)
-                         documents = self.pdf_parser.parse(temp_path)
-                         os.unlink(temp_path)  # Clean up
-
-                 elif pasted_text.strip():
-                     with st.spinner("Processing text..."):
-                         documents = self.text_parser.parse(pasted_text)
-
-                 else:
-                     st.warning("Please upload a PDF or paste some text.")
-                     return
-
-                 # Create vector store and answer question
-                 with st.spinner("Creating embeddings and searching..."):
-                     qa_chain = self.vector_chunker.create_qa_chain(documents, self.llm)
-                     result = qa_chain({"query": user_query})
-
-                 # Display results
-                 st.markdown("### 💬 Answer")
-                 st.write(result["result"])
-
-                 # Show sources
-                 with st.expander("📄 Source Documents"):
-                     for i, doc in enumerate(result.get("source_documents", [])):
-                         st.write(f"**Source {i+1}:**")
-                         content = doc.page_content
-                         st.write(content[:500] + "..." if len(content) > 500 else content)
-                         if hasattr(doc, 'metadata') and doc.metadata:
-                             st.write(f"*Metadata: {doc.metadata}*")
-                         st.write("---")
-
-             except Exception as e:
-                 st.error(f"An error occurred: {str(e)}")
-
-     def render_website_analysis_tab(self):
-         """Render Website GEO Analysis tab"""
-         st.header("🌐 Website GEO Analysis")
-         st.markdown("Analyze websites for Generative Engine Optimization (GEO) performance.")
-
-         # URL input
-         col1, col2 = st.columns([3, 1])
-
-         with col1:
-             website_url = st.text_input(
-                 "Enter website URL:",
-                 placeholder="https://example.com"
-             )
-
-         with col2:
-             max_pages = st.selectbox("Pages to analyze:", [1, 3, 5], index=0)
-
-         # Analysis options
-         col1, col2 = st.columns(2)
-         with col1:
-             include_subpages = st.checkbox("Include subpages", value=False)
-         with col2:
-             detailed_analysis = st.checkbox("Detailed analysis", value=True)
-
-         # Submit button
-         if st.button("🌐 Analyze Website", key="website_analyze"):
-             if not website_url.strip():
-                 st.warning("Please enter a website URL.")
-                 return
-
-             try:
-                 # Normalize URL
-                 if not website_url.startswith(('http://', 'https://')):
-                     website_url = 'https://' + website_url
-
-                 with st.spinner(f"Analyzing website: {website_url}"):
-                     # Parse website content
-                     pages_data = self.webpage_parser.parse_website(
-                         website_url,
-                         max_pages=max_pages,
-                         include_subpages=include_subpages
-                     )
-
-                     if not pages_data:
-                         st.error("Could not extract content from the website.")
-                         return
-
-                     st.success(f"Successfully extracted content from {len(pages_data)} page(s)")
-
-                     # Analyze GEO scores
-                     with st.spinner("Calculating GEO scores..."):
-                         geo_results = []
-
-                         for i, page_data in enumerate(pages_data):
-                             with st.spinner(f"Analyzing page {i+1}/{len(pages_data)}..."):
-                                 analysis = self.geo_scorer.analyze_page_geo(
-                                     page_data['content'],
-                                     page_data['title'],
-                                     detailed=detailed_analysis
-                                 )
-
-                                 if not analysis.get('error'):
-                                     analysis['page_data'] = page_data
-                                     geo_results.append(analysis)
-                                 else:
-                                     st.warning(f"Could not analyze page {i+1}: {analysis['error']}")
-
-                     if not geo_results:
-                         st.error("Could not analyze any pages from the website.")
-                         return
-
-                     # Display results
-                     self.display_geo_results(geo_results, website_url)
-
-                     # Export functionality
-                     st.markdown("### 📥 Export Results")
-                     if st.button("📊 Generate Full Report"):
-                         report_data = self.result_exporter.export_geo_results(
-                             geo_results,
-                             website_url
-                         )
-
-                         st.download_button(
-                             label="Download GEO Report",
-                             data=json.dumps(report_data, indent=2),
-                             file_name=f"geo_analysis_{website_url.replace('https://', '').replace('/', '_')}.json",
-                             mime="application/json"
-                         )
-
-             except Exception as e:
-                 st.error(f"An error occurred during website analysis: {str(e)}")
-
-     def display_geo_results(self, geo_results: List[Dict], website_url: str):
-         """Display GEO analysis results"""
-         st.markdown("## 📊 GEO Analysis Results")
-
-         # Calculate average scores
-         avg_scores = self.calculate_average_scores(geo_results)
-         overall_avg = sum(avg_scores.values()) / len(avg_scores) if avg_scores else 0
-
-         # Main score display
-         col1, col2, col3 = st.columns([1, 2, 1])
-         with col2:
-             st.metric(
-                 "Overall GEO Score",
-                 f"{overall_avg:.1f}/10",
-                 delta=f"{overall_avg - 7.0:.1f}" if overall_avg != 7.0 else None
-             )
-
-         # Individual metrics
-         st.markdown("### 📈 Detailed GEO Metrics")
-
-         # First row of metrics
-         col1, col2, col3, col4 = st.columns(4)
-         metrics_row1 = [
-             ("AI Search Visibility", "ai_search_visibility"),
-             ("Query Intent Match", "query_intent_matching"),
-             ("Factual Accuracy", "factual_accuracy"),
-             ("Conversational Ready", "conversational_readiness")
-         ]
-
-         for i, (display_name, key) in enumerate(metrics_row1):
-             with [col1, col2, col3, col4][i]:
-                 score = avg_scores.get(key, 0)
-                 st.metric(display_name, f"{score:.1f}")
-
-         # Second row of metrics
-         col1, col2, col3, col4 = st.columns(4)
-         metrics_row2 = [
-             ("Semantic Richness", "semantic_richness"),
-             ("Context Complete", "context_completeness"),
-             ("Citation Worthy", "citation_worthiness"),
-             ("Multi-Query Cover", "multi_query_coverage")
-         ]
-
-         for i, (display_name, key) in enumerate(metrics_row2):
-             with [col1, col2, col3, col4][i]:
-                 score = avg_scores.get(key, 0)
-                 st.metric(display_name, f"{score:.1f}")
-
-         # Recommendations
-         self.display_recommendations(geo_results)
-
-         # Detailed page analysis
-         with st.expander("📋 Detailed Page Analysis"):
-             for i, analysis in enumerate(geo_results):
-                 page_data = analysis.get('page_data', {})
-                 st.markdown(f"#### Page {i+1}: {page_data.get('title', 'Unknown Title')}")
-                 st.write(f"**URL**: {page_data.get('url', 'Unknown')}")
-                 st.write(f"**Word Count**: {page_data.get('word_count', 0)}")
-
-                 # Show topics and entities if available
-                 if 'primary_topics' in analysis:
-                     st.write(f"**Topics**: {', '.join(analysis['primary_topics'])}")
-
-                 if 'entities' in analysis:
-                     st.write(f"**Entities**: {', '.join(analysis['entities'])}")
-
-                 # Show page-specific scores
-                 if 'geo_scores' in analysis:
-                     scores = analysis['geo_scores']
-                     score_text = ", ".join([f"{k}: {v:.1f}" for k, v in scores.items()])
-                     st.write(f"**Scores**: {score_text}")
-
-                 st.write("---")
-
-     def display_recommendations(self, geo_results: List[Dict]):
-         """Display optimization recommendations"""
-         st.markdown("### 💡 Optimization Recommendations")
-
-         # Collect all recommendations
-         all_recommendations = []
-         all_opportunities = []
-
-         for analysis in geo_results:
-             all_recommendations.extend(analysis.get('recommendations', []))
-             all_opportunities.extend(analysis.get('optimization_opportunities', []))
-
-         # Remove duplicates and display
-         unique_recommendations = list(set(all_recommendations))
-
-         if unique_recommendations:
-             for i, rec in enumerate(unique_recommendations[:5], 1):
-                 st.write(f"**{i}.** {rec}")
-
-         # Priority opportunities
-         if all_opportunities:
-             st.markdown("#### 🚀 Priority Optimizations")
-
-             high_priority = [opp for opp in all_opportunities if opp.get('priority') == 'high']
-             medium_priority = [opp for opp in all_opportunities if opp.get('priority') == 'medium']
-
-             if high_priority:
-                 st.markdown("##### 🔴 High Priority")
-                 for opp in high_priority[:3]:
-                     st.write(f"**{opp.get('type', 'Optimization')}**: {opp.get('description', 'No description')}")
-
-             if medium_priority:
-                 st.markdown("##### 🟡 Medium Priority")
-                 for opp in medium_priority[:3]:
-                     st.write(f"**{opp.get('type', 'Optimization')}**: {opp.get('description', 'No description')}")
-
-     def calculate_average_scores(self, geo_results: List[Dict]) -> Dict[str, float]:
-         """Calculate average GEO scores across all pages"""
-         if not geo_results:
-             return {}
-
-         # Get all score keys from the first result
-         score_keys = list(geo_results[0].get('geo_scores', {}).keys())
-         avg_scores = {}
-
-         for key in score_keys:
-             scores = [
-                 result['geo_scores'][key]
-                 for result in geo_results
-                 if 'geo_scores' in result and key in result['geo_scores']
-             ]
-             avg_scores[key] = sum(scores) / len(scores) if scores else 0
-
-         return avg_scores
-
-     def save_uploaded_file(self, uploaded_file) -> str:
-         """Save uploaded file to temporary location"""
          with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
              tmp_file.write(uploaded_file.read())
-             return tmp_file.name
-
-     def render_generate_geo_content_tab(self):
-         """Tab to generate fresh GEO-optimized content using system prompts"""
-         st.header("🧠 Generate GEO Content")
-         st.markdown("Use this tool to generate AI-optimized content from scratch based on your topic or query.")
-
-         # User input
-         user_prompt = st.text_area("Describe the content you want (e.g., topic, style, target audience):", height=150)
-
-         # Continue chat option
-         if "chat_history" not in st.session_state:
-             st.session_state.chat_history = []
-
-         if st.button("🧠 Generate Content"):
-             if not user_prompt.strip():
-                 st.warning("Please enter a topic or description.")
-                 return
-
-             # Add user message to chat history
-             st.session_state.chat_history.append(HumanMessage(content=user_prompt))
-
-             # Define system prompt for GEO content generation
-             system_prompt = (
-                 "You are a Generative Engine Optimization (GEO) content creation specialist. "
-                 "Create content that is highly optimized for AI systems, LLMs, and generative search engines. "
-                 "Ensure the content includes rich semantics, clear structure, relevant keywords, and is suitable for conversational use, citations, and AI summaries."
-             )
-             st.session_state.chat_history.insert(0, SystemMessagePromptTemplate.from_template(system_prompt).format())
-
-             with st.spinner("Generating GEO-optimized content..."):
-                 response = self.llm.invoke(st.session_state.chat_history)
-                 st.session_state.chat_history.append(AIMessage(content=response.content))
-                 st.success("✅ Content generated successfully!")
-
-         # Display chat history
-         for msg in st.session_state.chat_history:
-             if isinstance(msg, HumanMessage):
-                 st.markdown(f"**🧑 You:** {msg.content}")
-             elif isinstance(msg, AIMessage):
-                 st.markdown(f"**🤖 Assistant:** {msg.content}")
-
-
- def main():
-     """Main entry point"""
-     app = GEOSEOApp()
-     app.run()
-
-
- if __name__ == "__main__":
-     main()
 
  import os
  import tempfile
+ import streamlit as st

+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain_community.vectorstores import FAISS
  from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain.chains import RetrievalQA
+ from langchain.prompts import PromptTemplate
+ from langchain.schema import Document
+ # from langchain_groq import GroqLLM
+ from langchain_groq import ChatGroq

+ # --- Environment Variables ---
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY", "your-groq-api-key")
+ HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY", "your-huggingface-api-key")
+
+ # --- Initialize Groq LLM ---
+ # llm = GroqLLM(
+ #     api_key=GROQ_API_KEY,
+ #     model="llama3-8b-8192",
+ #     temperature=0.1
+ # )
+ llm = ChatGroq(
+     api_key=GROQ_API_KEY,
+     model_name="llama3-8b-8192",  # Note: it's `model_name` not `model`
+     temperature=0.1
+ )
+
+ # --- HuggingFace Embeddings ---
+ embedding = HuggingFaceEmbeddings(
+     model_name="sentence-transformers/all-MiniLM-L6-v2",
+     cache_folder="./hf_cache",
+     # huggingfacehub_api_token=HUGGINGFACE_API_KEY
+ )
+ # embedding = HuggingFaceEmbeddings(
+ #     model_name="sentence-transformers/all-MiniLM-L6-v2"
+ # )
+
+ # --- Streamlit UI ---
+ st.title("📄📥 Chat with PDF or Text using Groq + RAG")
+
+ # Option to upload PDF
+ uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
+
+ # Option to paste raw text
+ pasted_text = st.text_area("Or paste some text below:")
+
+ # User's question
+ user_query = st.text_input("Ask a question about the content")
+
+ # Submit button
+ submit_button = st.button("Submit")
+
+ if submit_button:
+     documents = []
+
+     # Handle uploaded PDF
+     if uploaded_file:
          with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
              tmp_file.write(uploaded_file.read())
+             tmp_path = tmp_file.name
+
+         loader = PyPDFLoader(tmp_path)
+         documents = loader.load_and_split()
+
+     # Handle pasted text if no PDF
+     elif pasted_text.strip():
+         documents = [Document(page_content=pasted_text)]
+
+     else:
+         st.warning("Please upload a PDF or paste some text.")
+         st.stop()
+
+     # Create vector store
+     vectorstore = FAISS.from_documents(documents, embedding)
+     retriever = vectorstore.as_retriever()
+
+     # Optional custom prompt
+     prompt_template = PromptTemplate(
+         input_variables=["context", "question"],
+         template="""
+ You are an AI assistant. Use the following context to answer the question.
+ Be concise, accurate, and helpful.
+
+ Context: {context}
+ Question: {question}
+ Answer:"""
+     )
+
+     # QA Chain
+     qa_chain = RetrievalQA.from_chain_type(
+         llm=llm,
+         chain_type="stuff",
+         retriever=retriever,
+         return_source_documents=True,
+         chain_type_kwargs={"prompt": prompt_template}
+     )
+
+     # Run QA
+     result = qa_chain({"query": user_query})
+
+     # Show result
+     st.markdown("### 💬 Answer")
+     st.write(result["result"])
+
+     # Show sources (only if from PDF)
+     if uploaded_file:
+         with st.expander("📄 Sources"):
+             for i, doc in enumerate(result["source_documents"]):
+                 st.write(f"**Page {i+1}** {doc.metadata.get('source', 'Unknown')}")
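Note on the rewritten app.py: newer LangChain releases deprecate calling a chain directly, as in `qa_chain({"query": user_query})` above, in favor of `Chain.invoke`. A minimal standalone sketch of the same retrieval flow with the newer call style (assumes the packages pinned in requirements.txt and a valid GROQ_API_KEY; the helper name `answer_question` is illustrative, not part of this change):

import os

from langchain.chains import RetrievalQA
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq


def answer_question(texts, question):
    """Build a throwaway FAISS index over `texts` and answer a single question."""
    docs = [Document(page_content=t) for t in texts]
    embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    retriever = FAISS.from_documents(docs, embedding).as_retriever()
    llm = ChatGroq(api_key=os.environ["GROQ_API_KEY"], model_name="llama3-8b-8192", temperature=0.1)
    qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
    # `.invoke` replaces the deprecated dict-style call used in the script above.
    return qa_chain.invoke({"query": question})["result"]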
 
demo.json DELETED
@@ -1,58 +0,0 @@
- {
-   "website_url": "https://example.com",
-   "geo_results": [
-     {
-       "page_data": {
-         "url": "https://example.com/page1",
-         "title": "Example Page 1",
-         "word_count": 500
-       },
-       "geo_scores": {
-         "ai_search_visibility": 7.5,
-         "query_intent_matching": 8.0,
-         "factual_accuracy": 9.0,
-         "conversational_readiness": 6.5,
-         "semantic_richness": 7.0,
-         "context_completeness": 8.5,
-         "citation_worthiness": 7.8,
-         "multi_query_coverage": 6.0
-       },
-       "overall_geo_score": 7.5,
-       "primary_topics": ["SEO", "AI Optimization"],
-       "entities": ["Google", "OpenAI"],
-       "recommendations": [
-         "Add more semantic keywords",
-         "Improve conversational flow"
-       ],
-       "optimization_opportunities": [
-         {
-           "type": "semantic_enhancement",
-           "description": "Add more related terms",
-           "priority": "high"
-         }
-       ]
-     }
-   ],
-   "enhancement_results": {
-     "original_content": "Sample content for enhancement.",
-     "analysis_date": "2024-06-01T12:00:00",
-     "clarity_score": 8.5,
-     "structure_score": 7.0,
-     "answerability_score": 9.0,
-     "keywords": ["example", "installation", "setup"],
-     "optimized_content": "Enhanced sample content.",
-     "improvements_made": ["Improved clarity", "Added keywords"]
-   },
-   "qa_results": [
-     {
-       "query": "What is SEO?",
-       "result": "SEO stands for Search Engine Optimization.",
-       "sources": [
-         {
-           "content": "SEO stands for Search Engine Optimization...",
-           "metadata": {"source": "example.com/page1"}
-         }
-       ]
-     }
-   ]
- }
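demo.json documented the shape of an exported analysis report. For anyone still consuming reports in that shape after this deletion, a minimal sketch of reading one (assumes a local copy of the file; the loop is illustrative, not part of the codebase):

import json

with open("demo.json") as f:
    report = json.load(f)

# Print each page's overall score alongside the mean of its individual GEO metrics.
for page in report["geo_results"]:
    scores = page["geo_scores"]
    mean_metric = sum(scores.values()) / len(scores)
    print(page["page_data"]["title"], page["overall_geo_score"], round(mean_metric, 1))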
 
requirements.txt CHANGED
@@ -8,9 +8,3 @@ faiss-cpu
  transformers
  sentence-transformers
  pypdf
- beautifulsoup4
- requests
- numpy
- pandas
- openpyxl
- torch
 
utils/chunker.py DELETED
@@ -1,1314 +0,0 @@
1
- """
2
- Vector Chunking and RAG Module
3
- Handles document chunking, vector embeddings, and RAG question-answering
4
- """
5
-
6
- import os
7
- import json
8
- import numpy as np
9
- from typing import Dict, Any, List, Optional, Tuple
10
- from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
11
- from langchain.schema import Document
12
- from langchain_community.vectorstores import FAISS, Chroma
13
- from langchain.chains import RetrievalQA, ConversationalRetrievalChain
14
- from langchain.memory import ConversationBufferMemory
15
- from langchain.prompts import PromptTemplate
16
- import tempfile
17
- import shutil
18
-
19
-
20
- class VectorChunker:
21
- """Main class for document chunking and vector operations"""
22
-
23
- def __init__(self, embeddings_model, chunk_size: int = 1000, chunk_overlap: int = 200):
24
- self.embeddings = embeddings_model
25
- self.chunk_size = chunk_size
26
- self.chunk_overlap = chunk_overlap
27
- self.setup_text_splitters()
28
- self.vector_stores = {} # Cache for vector stores
29
-
30
- def setup_text_splitters(self):
31
- """Initialize different text splitting strategies"""
32
-
33
- # Default recursive splitter
34
- self.recursive_splitter = RecursiveCharacterTextSplitter(
35
- chunk_size=self.chunk_size,
36
- chunk_overlap=self.chunk_overlap,
37
- length_function=len,
38
- separators=["\n\n", "\n", " ", ""]
39
- )
40
-
41
- # Character-based splitter
42
- self.character_splitter = CharacterTextSplitter(
43
- chunk_size=self.chunk_size,
44
- chunk_overlap=self.chunk_overlap,
45
- separator="\n\n"
46
- )
47
-
48
- # Semantic splitter for better context preservation
49
- self.semantic_splitter = RecursiveCharacterTextSplitter(
50
- chunk_size=800, # Smaller chunks for better semantic coherence
51
- chunk_overlap=150,
52
- length_function=len,
53
- separators=["\n\n", "\n", ". ", " ", ""]
54
- )
55
-
56
- def chunk_documents(self, documents: List[Document], strategy: str = "recursive") -> List[Document]:
57
- """
58
- Chunk documents using specified strategy
59
-
60
- Args:
61
- documents (List[Document]): List of documents to chunk
62
- strategy (str): Chunking strategy ("recursive", "character", "semantic")
63
-
64
- Returns:
65
- List[Document]: List of chunked documents
66
- """
67
- try:
68
- # Choose splitter based on strategy
69
- if strategy == "character":
70
- splitter = self.character_splitter
71
- elif strategy == "semantic":
72
- splitter = self.semantic_splitter
73
- else:
74
- splitter = self.recursive_splitter
75
-
76
- # Split documents
77
- chunked_docs = []
78
-
79
- for doc in documents:
80
- chunks = splitter.split_documents([doc])
81
-
82
- # Add chunk metadata
83
- for i, chunk in enumerate(chunks):
84
- chunk.metadata.update({
85
- 'chunk_index': i,
86
- 'total_chunks': len(chunks),
87
- 'chunk_strategy': strategy,
88
- 'original_source': doc.metadata.get('source', 'unknown'),
89
- 'chunk_size': len(chunk.page_content),
90
- 'chunk_word_count': len(chunk.page_content.split())
91
- })
92
-
93
- chunked_docs.extend(chunks)
94
-
95
- return chunked_docs
96
-
97
- except Exception as e:
98
- raise Exception(f"Document chunking failed: {str(e)}")
99
-
100
- def create_vector_store(self, documents: List[Document], store_type: str = "faiss",
101
- persist_directory: Optional[str] = None) -> Any:
102
- """
103
- Create vector store from documents
104
-
105
- Args:
106
- documents (List[Document]): Documents to vectorize
107
- store_type (str): Type of vector store ("faiss", "chroma")
108
- persist_directory (str): Optional directory to persist the store
109
-
110
- Returns:
111
- Vector store instance
112
- """
113
- try:
114
- if not documents:
115
- raise ValueError("No documents provided for vector store creation")
116
-
117
- if store_type.lower() == "chroma":
118
- if persist_directory:
119
- vector_store = Chroma.from_documents(
120
- documents=documents,
121
- embedding=self.embeddings,
122
- persist_directory=persist_directory
123
- )
124
- vector_store.persist()
125
- else:
126
- vector_store = Chroma.from_documents(
127
- documents=documents,
128
- embedding=self.embeddings
129
- )
130
- else: # Default to FAISS
131
- vector_store = FAISS.from_documents(
132
- documents=documents,
133
- embedding=self.embeddings
134
- )
135
-
136
- # Save FAISS index if persist directory provided
137
- if persist_directory:
138
- os.makedirs(persist_directory, exist_ok=True)
139
- vector_store.save_local(persist_directory)
140
-
141
- return vector_store
142
-
143
- except Exception as e:
144
- raise Exception(f"Vector store creation failed: {str(e)}")
145
-
146
- def create_qa_chain(self, documents: List[Document], llm, chain_type: str = "stuff") -> RetrievalQA:
147
- """
148
- Create a Question-Answering chain from documents
149
-
150
- Args:
151
- documents (List[Document]): Documents for the knowledge base
152
- llm: Language model for answering questions
153
- chain_type (str): Type of QA chain ("stuff", "map_reduce", "refine")
154
-
155
- Returns:
156
- RetrievalQA: Configured QA chain
157
- """
158
- try:
159
- # Chunk documents
160
- chunked_docs = self.chunk_documents(documents, strategy="semantic")
161
-
162
- # Create vector store
163
- vector_store = self.create_vector_store(chunked_docs, store_type="faiss")
164
-
165
- # Create retriever
166
- retriever = vector_store.as_retriever(
167
- search_type="similarity",
168
- search_kwargs={"k": 4} # Retrieve top 4 most relevant chunks
169
- )
170
-
171
- # Custom prompt for GEO-focused QA
172
- qa_prompt_template = """Use the following pieces of context to answer the question at the end.
173
- If you don't know the answer, just say that you don't know, don't try to make up an answer.
174
- Focus on providing clear, accurate, and complete answers that would be suitable for AI search engines.
175
-
176
- Context:
177
- {context}
178
-
179
- Question: {question}
180
-
181
- Answer:"""
182
-
183
- qa_prompt = PromptTemplate(
184
- template=qa_prompt_template,
185
- input_variables=["context", "question"]
186
- )
187
-
188
- # Create QA chain
189
- qa_chain = RetrievalQA.from_chain_type(
190
- llm=llm,
191
- chain_type=chain_type,
192
- retriever=retriever,
193
- return_source_documents=True,
194
- chain_type_kwargs={"prompt": qa_prompt}
195
- )
196
-
197
- return qa_chain
198
-
199
- except Exception as e:
200
- raise Exception(f"QA chain creation failed: {str(e)}")
201
-
202
- def create_conversational_chain(self, documents: List[Document], llm) -> ConversationalRetrievalChain:
203
- """
204
- Create a conversational retrieval chain with memory
205
-
206
- Args:
207
- documents (List[Document]): Documents for the knowledge base
208
- llm: Language model for conversation
209
-
210
- Returns:
211
- ConversationalRetrievalChain: Configured conversational chain
212
- """
213
- try:
214
- # Chunk documents
215
- chunked_docs = self.chunk_documents(documents, strategy="semantic")
216
-
217
- # Create vector store
218
- vector_store = self.create_vector_store(chunked_docs, store_type="faiss")
219
-
220
- # Create retriever
221
- retriever = vector_store.as_retriever(
222
- search_type="similarity",
223
- search_kwargs={"k": 3}
224
- )
225
-
226
- # Create memory
227
- memory = ConversationBufferMemory(
228
- memory_key="chat_history",
229
- return_messages=True,
230
- output_key="answer"
231
- )
232
-
233
- # Custom prompt for conversational QA
234
- condense_question_prompt = """Given the following conversation and a follow up question,
235
- rephrase the follow up question to be a standalone question that can be understood without the chat history.
236
-
237
- Chat History:
238
- {chat_history}
239
- Follow Up Input: {question}
240
- Standalone question:"""
241
-
242
- # Create conversational chain
243
- conv_chain = ConversationalRetrievalChain.from_llm(
244
- llm=llm,
245
- retriever=retriever,
246
- memory=memory,
247
- return_source_documents=True,
248
- condense_question_prompt=PromptTemplate.from_template(condense_question_prompt)
249
- )
250
-
251
- return conv_chain
252
-
253
- except Exception as e:
254
- raise Exception(f"Conversational chain creation failed: {str(e)}")
255
-
256
- def semantic_search(self, query: str, documents: List[Document], top_k: int = 5) -> List[Dict[str, Any]]:
257
- """
258
- Perform semantic search on documents
259
-
260
- Args:
261
- query (str): Search query
262
- documents (List[Document]): Documents to search
263
- top_k (int): Number of top results to return
264
-
265
- Returns:
266
- List[Dict]: Search results with scores
267
- """
268
- try:
269
- # Chunk documents
270
- chunked_docs = self.chunk_documents(documents, strategy="semantic")
271
-
272
- # Create vector store
273
- vector_store = self.create_vector_store(chunked_docs, store_type="faiss")
274
-
275
- # Perform similarity search with scores
276
- results = vector_store.similarity_search_with_score(query, k=top_k)
277
-
278
- # Format results
279
- formatted_results = []
280
- for doc, score in results:
281
- result = {
282
- 'content': doc.page_content,
283
- 'metadata': doc.metadata,
284
- 'similarity_score': float(score),
285
- 'relevance_rank': len(formatted_results) + 1
286
- }
287
- formatted_results.append(result)
288
-
289
- return formatted_results
290
-
291
- except Exception as e:
292
- raise Exception(f"Semantic search failed: {str(e)}")
293
-
294
- def analyze_document_similarity(self, documents: List[Document]) -> Dict[str, Any]:
- """
- Analyze similarity between documents
-
- Args:
- documents (List[Document]): Documents to analyze
-
- Returns:
- Dict: Similarity analysis results
- """
- try:
- if len(documents) < 2:
- return {'error': 'Need at least 2 documents for similarity analysis'}
-
- # Chunk documents
- chunked_docs = self.chunk_documents(documents, strategy="semantic")
-
- # Create embeddings for each document
- doc_embeddings = []
- doc_metadata = []
-
- for doc in chunked_docs:
- # Get embedding for the document
- embedding = self.embeddings.embed_query(doc.page_content)
- doc_embeddings.append(embedding)
- doc_metadata.append({
- 'content_preview': doc.page_content[:200] + "...",
- 'metadata': doc.metadata,
- 'length': len(doc.page_content)
- })
-
- # Calculate pairwise similarities
- similarities = []
- embeddings_array = np.array(doc_embeddings)
-
- for i in range(len(embeddings_array)):
- for j in range(i + 1, len(embeddings_array)):
- # Calculate cosine similarity
- similarity = np.dot(embeddings_array[i], embeddings_array[j]) / (
- np.linalg.norm(embeddings_array[i]) * np.linalg.norm(embeddings_array[j])
- )
-
- similarities.append({
- 'doc_1_index': i,
- 'doc_2_index': j,
- 'similarity_score': float(similarity),
- 'doc_1_preview': doc_metadata[i]['content_preview'],
- 'doc_2_preview': doc_metadata[j]['content_preview']
- })
-
- # Sort by similarity score
- similarities.sort(key=lambda x: x['similarity_score'], reverse=True)
-
- # Calculate statistics
- similarity_scores = [s['similarity_score'] for s in similarities]
-
- return {
- 'total_comparisons': len(similarities),
- 'average_similarity': np.mean(similarity_scores),
- 'max_similarity': max(similarity_scores),
- 'min_similarity': min(similarity_scores),
- 'similarity_distribution': {
- 'high_similarity': len([s for s in similarity_scores if s > 0.8]),
- 'medium_similarity': len([s for s in similarity_scores if 0.5 < s <= 0.8]),
- 'low_similarity': len([s for s in similarity_scores if s <= 0.5])
- },
- 'top_similar_pairs': similarities[:5],
- 'most_dissimilar_pairs': similarities[-3:]
- }
-
- except Exception as e:
- return {'error': f"Similarity analysis failed: {str(e)}"}
-
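Editor's note: the cosine similarity computed above is just the dot product of the two embeddings divided by the product of their norms; a tiny worked example with made-up vectors:

import numpy as np

a = np.array([1.0, 0.0, 1.0])
b = np.array([1.0, 1.0, 0.0])
cos = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(round(float(cos), 3))  # 1 / (sqrt(2) * sqrt(2)) = 0.5; parallel vectors give 1.0, orthogonal give 0.0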
- def extract_key_passages(self, documents: List[Document], queries: List[str],
- passages_per_query: int = 3) -> Dict[str, List[Dict[str, Any]]]:
- """
- Extract key passages from documents based on multiple queries
-
- Args:
- documents (List[Document]): Documents to search
- queries (List[str]): List of queries to search for
- passages_per_query (int): Number of passages to extract per query
-
- Returns:
- Dict: Key passages organized by query
- """
- try:
- # Chunk documents
- chunked_docs = self.chunk_documents(documents, strategy="semantic")
-
- # Create vector store
- vector_store = self.create_vector_store(chunked_docs, store_type="faiss")
-
- key_passages = {}
-
- for query in queries:
- # Search for relevant passages
- results = vector_store.similarity_search_with_score(query, k=passages_per_query)
-
- passages = []
- for doc, score in results:
- passage = {
- 'content': doc.page_content,
- 'relevance_score': float(score),
- 'metadata': doc.metadata,
- 'word_count': len(doc.page_content.split()),
- 'query_match': query
- }
- passages.append(passage)
-
- key_passages[query] = passages
-
- return key_passages
-
- except Exception as e:
- return {'error': f"Key passage extraction failed: {str(e)}"}
-
- def optimize_chunking_strategy(self, documents: List[Document],
- test_queries: List[str]) -> Dict[str, Any]:
- """
- Test different chunking strategies and recommend the best one
-
- Args:
- documents (List[Document]): Documents to test
- test_queries (List[str]): Queries to test retrieval performance
-
- Returns:
- Dict: Optimization results and recommendations
- """
- try:
- strategies = ["recursive", "character", "semantic"]
- strategy_results = {}
-
- for strategy in strategies:
- try:
- # Test this strategy
- chunked_docs = self.chunk_documents(documents, strategy=strategy)
- vector_store = self.create_vector_store(chunked_docs, store_type="faiss")
-
- # Test retrieval performance
- retrieval_scores = []
-
- for query in test_queries:
- results = vector_store.similarity_search_with_score(query, k=3)
-
- # Calculate average relevance score
- if results:
- avg_score = sum(score for _, score in results) / len(results)
- retrieval_scores.append(float(avg_score))
-
- # Calculate strategy metrics
- avg_retrieval_score = np.mean(retrieval_scores) if retrieval_scores else 0
- total_chunks = len(chunked_docs)
- avg_chunk_size = np.mean([len(doc.page_content) for doc in chunked_docs])
-
- strategy_results[strategy] = {
- 'average_retrieval_score': avg_retrieval_score,
- 'total_chunks': total_chunks,
- 'average_chunk_size': avg_chunk_size,
- 'retrieval_scores': retrieval_scores,
- 'chunk_size_distribution': {
- 'min': min(len(doc.page_content) for doc in chunked_docs),
- 'max': max(len(doc.page_content) for doc in chunked_docs),
- 'std': float(np.std([len(doc.page_content) for doc in chunked_docs]))
- }
- }
-
- except Exception as e:
- strategy_results[strategy] = {'error': f"Strategy test failed: {str(e)}"}
-
- # Determine best strategy
- valid_strategies = {k: v for k, v in strategy_results.items() if 'error' not in v}
-
- if valid_strategies:
- # similarity_search_with_score on a FAISS store returns L2 distances, so lower is better
- best_strategy = min(valid_strategies.keys(),
- key=lambda k: valid_strategies[k]['average_retrieval_score'])
-
- recommendation = {
- 'recommended_strategy': best_strategy,
- 'reason': f"Lowest average retrieval distance: {valid_strategies[best_strategy]['average_retrieval_score']:.4f}",
- 'all_results': strategy_results,
- 'performance_summary': {
- strategy: result.get('average_retrieval_score', 0)
- for strategy, result in valid_strategies.items()
- }
- }
- else:
- recommendation = {
- 'recommended_strategy': 'recursive', # Default fallback
- 'reason': 'All strategies failed, using default',
- 'all_results': strategy_results
- }
-
- return recommendation
-
- except Exception as e:
- return {'error': f"Chunking optimization failed: {str(e)}"}
-
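Editor's note: the score direction matters when reading these results — LangChain's FAISS store returns raw L2 distances from similarity_search_with_score, where smaller means more similar. A self-contained check (model name as used elsewhere in this app; sample texts invented):

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vs = FAISS.from_texts(["GEO targets AI answer engines.", "Bananas are yellow."], emb)
for doc, dist in vs.similarity_search_with_score("What is GEO?", k=2):
    print(round(float(dist), 3), doc.page_content)  # the on-topic text returns the smaller distance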
- def create_document_summary(self, documents: List[Document], llm,
- summary_type: str = "extractive") -> Dict[str, Any]:
- """
- Create document summaries using the chunked content
-
- Args:
- documents (List[Document]): Documents to summarize
- llm: Language model for summarization
- summary_type (str): Type of summary ("extractive", "abstractive")
-
- Returns:
- Dict: Summary results
- """
- try:
- # Chunk documents for better processing
- chunked_docs = self.chunk_documents(documents, strategy="semantic")
-
- if summary_type == "extractive":
- # Extract key sentences/chunks
- return self._create_extractive_summary(chunked_docs)
- else:
- # Generate abstractive summary using LLM
- return self._create_abstractive_summary(chunked_docs, llm)
-
- except Exception as e:
- return {'error': f"Document summarization failed: {str(e)}"}
-
- def _create_extractive_summary(self, chunked_docs: List[Document]) -> Dict[str, Any]:
- """Create extractive summary by selecting key chunks"""
- try:
- # Simple extractive approach: select chunks with highest semantic density
- chunk_scores = []
-
- for doc in chunked_docs:
- content = doc.page_content
- # Simple scoring based on content characteristics
- word_count = len(content.split())
- sentence_count = len([s for s in content.split('.') if s.strip()])
-
- # Score based on information density
- density_score = word_count / max(sentence_count, 1)
-
- # Bonus for chunks with questions, definitions, or lists
- structure_bonus = 0
- if '?' in content:
- structure_bonus += 1
- if any(word in content.lower() for word in ['define', 'definition', 'means', 'refers to']):
- structure_bonus += 2
- if content.count('\n•') > 0 or content.count('1.') > 0:
- structure_bonus += 1
-
- total_score = density_score + structure_bonus
- chunk_scores.append((doc, total_score))
-
- # Sort by score and select top chunks for summary
- chunk_scores.sort(key=lambda x: x[1], reverse=True)
- top_chunks = chunk_scores[:min(5, len(chunk_scores))]
-
- summary_content = []
- for doc, score in top_chunks:
- summary_content.append({
- 'content': doc.page_content,
- 'score': score,
- 'metadata': doc.metadata
- })
-
- return {
- 'summary_type': 'extractive',
- 'key_chunks': summary_content,
- 'total_chunks_analyzed': len(chunked_docs),
- 'chunks_selected': len(top_chunks)
- }
-
- except Exception as e:
- return {'error': f"Extractive summary failed: {str(e)}"}
-
- def _create_abstractive_summary(self, chunked_docs: List[Document], llm) -> Dict[str, Any]:
- """Create abstractive summary using language model"""
- try:
- # Combine content from top chunks
- combined_content = "\n\n".join([doc.page_content for doc in chunked_docs[:10]])
-
- summary_prompt = f"""Please provide a comprehensive summary of the following content.
- Focus on the main topics, key insights, and important details that would be valuable for AI search engines.
-
- Content:
- {combined_content[:5000]}
-
- Summary:"""
-
- from langchain.prompts import ChatPromptTemplate
-
- # Pass the content through a template variable; inlining it would make any
- # literal braces in the document text be parsed as (missing) template fields.
- prompt_template = ChatPromptTemplate.from_messages([
- ("system", "You are a professional content summarizer. Create clear, informative summaries."),
- ("user", "{summary_request}")
- ])
-
- chain = prompt_template | llm
- result = chain.invoke({"summary_request": summary_prompt})
-
- summary_text = result.content if hasattr(result, 'content') else str(result)
-
- return {
- 'summary_type': 'abstractive',
- 'summary': summary_text,
- 'source_chunks': len(chunked_docs),
- 'content_length_processed': len(combined_content)
- }
-
- except Exception as e:
- return {'error': f"Abstractive summary failed: {str(e)}"}
-
- def save_vector_store(self, vector_store, directory_path: str, store_type: str = "faiss") -> bool:
- """
- Save vector store to disk
-
- Args:
- vector_store: Vector store instance to save
- directory_path (str): Directory to save the store
- store_type (str): Type of vector store
-
- Returns:
- bool: Success status
- """
- try:
- os.makedirs(directory_path, exist_ok=True)
-
- if store_type.lower() == "faiss":
- vector_store.save_local(directory_path)
- elif store_type.lower() == "chroma":
- # Chroma stores are typically persisted during creation
- pass
-
- return True
-
- except Exception as e:
- print(f"Failed to save vector store: {str(e)}")
- return False
-
- def load_vector_store(self, directory_path: str, store_type: str = "faiss"):
- """
- Load vector store from disk
-
- Args:
- directory_path (str): Directory containing the saved store
- store_type (str): Type of vector store
-
- Returns:
- Vector store instance or None if failed
- """
- try:
- if not os.path.exists(directory_path):
- return None
-
- if store_type.lower() == "faiss":
- vector_store = FAISS.load_local(
- directory_path,
- self.embeddings,
- allow_dangerous_deserialization=True
- )
- return vector_store
- elif store_type.lower() == "chroma":
- vector_store = Chroma(
- persist_directory=directory_path,
- embedding_function=self.embeddings
- )
- return vector_store
-
- return None
-
- except Exception as e:
- print(f"Failed to load vector store: {str(e)}")
- return None
-
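Editor's note: a save/load round trip for the two methods above might look like this (paths and sample text are illustrative; assumes the module is importable):

# Hypothetical round trip; not part of the deleted file.
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from utils.chunker import VectorChunker

emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
ck = VectorChunker(emb)
chunks = ck.chunk_documents([Document(page_content="Some content worth indexing.")])
store = ck.create_vector_store(chunks, store_type="faiss")
ck.save_vector_store(store, "data/vector_store", store_type="faiss")
assert ck.load_vector_store("data/vector_store", store_type="faiss") is not None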
- def get_chunking_stats(self, documents: List[Document], strategy: str = "recursive") -> Dict[str, Any]:
- """
- Get detailed statistics about document chunking
-
- Args:
- documents (List[Document]): Documents to analyze
- strategy (str): Chunking strategy to use
-
- Returns:
- Dict: Detailed chunking statistics
- """
- try:
- # Chunk documents
- chunked_docs = self.chunk_documents(documents, strategy=strategy)
-
- # Calculate statistics
- chunk_sizes = [len(doc.page_content) for doc in chunked_docs]
- word_counts = [len(doc.page_content.split()) for doc in chunked_docs]
-
- stats = {
- 'strategy_used': strategy,
- 'original_documents': len(documents),
- 'total_chunks': len(chunked_docs),
- 'chunk_size_stats': {
- 'min': min(chunk_sizes) if chunk_sizes else 0,
- 'max': max(chunk_sizes) if chunk_sizes else 0,
- 'mean': np.mean(chunk_sizes) if chunk_sizes else 0,
- 'median': np.median(chunk_sizes) if chunk_sizes else 0,
- 'std': np.std(chunk_sizes) if chunk_sizes else 0
- },
- 'word_count_stats': {
- 'min': min(word_counts) if word_counts else 0,
- 'max': max(word_counts) if word_counts else 0,
- 'mean': np.mean(word_counts) if word_counts else 0,
- 'median': np.median(word_counts) if word_counts else 0,
- 'std': np.std(word_counts) if word_counts else 0
- },
- 'chunk_distribution': {
- 'very_small': len([s for s in chunk_sizes if s < 200]),
- 'small': len([s for s in chunk_sizes if 200 <= s < 500]),
- 'medium': len([s for s in chunk_sizes if 500 <= s < 1000]),
- 'large': len([s for s in chunk_sizes if 1000 <= s < 2000]),
- 'very_large': len([s for s in chunk_sizes if s >= 2000])
- },
- 'overlap_efficiency': self._calculate_overlap_efficiency(chunked_docs),
- 'content_coverage': self._calculate_content_coverage(documents, chunked_docs)
- }
-
- return stats
-
- except Exception as e:
- return {'error': f"Chunking statistics failed: {str(e)}"}
-
- def _calculate_overlap_efficiency(self, chunked_docs: List[Document]) -> float:
- """Calculate efficiency of chunk overlaps"""
- try:
- if len(chunked_docs) < 2:
- return 1.0
-
- total_content_length = sum(len(doc.page_content) for doc in chunked_docs)
- unique_content = set()
-
- # Rough estimate of content uniqueness
- for doc in chunked_docs:
- words = doc.page_content.split()
- for i in range(0, len(words), 10): # Sample every 10th word
- unique_content.add(' '.join(words[i:i+10]))
-
- # Efficiency as ratio of unique content to total content
- efficiency = len(unique_content) * 10 / total_content_length if total_content_length > 0 else 0
- return min(efficiency, 1.0)
-
- except Exception:
- return 0.5 # Default neutral efficiency
-
- def _calculate_content_coverage(self, original_docs: List[Document],
- chunked_docs: List[Document]) -> float:
- """Calculate how well chunks cover original content"""
- try:
- original_content = ' '.join([doc.page_content for doc in original_docs])
- chunked_content = ' '.join([doc.page_content for doc in chunked_docs])
-
- # Simple coverage metric based on length
- coverage = len(chunked_content) / len(original_content) if original_content else 0
- return min(coverage, 1.0)
-
- except Exception:
- return 0.0
-
-
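Editor's note: the statistics helper above can be exercised in a few lines; a hedged sketch (repeated text is a stand-in for a real document):

from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from utils.chunker import VectorChunker

ck = VectorChunker(HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"))
stats = ck.get_chunking_stats([Document(page_content="A long document. " * 400)], strategy="recursive")
print(stats['total_chunks'], stats['chunk_size_stats'])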
- class ChunkingOptimizer:
- """Helper class for optimizing chunking parameters"""
-
- def __init__(self, embeddings_model):
- self.embeddings = embeddings_model
-
- def optimize_chunk_size(self, documents: List[Document], test_queries: List[str],
- size_range: Tuple[int, int] = (200, 2000),
- step_size: int = 200) -> Dict[str, Any]:
- """
- Find optimal chunk size for given documents and queries
-
- Args:
- documents (List[Document]): Documents to test
- test_queries (List[str]): Queries for testing retrieval
- size_range (Tuple[int, int]): Range of chunk sizes to test
- step_size (int): Step size for testing
-
- Returns:
- Dict: Optimization results with recommended chunk size
- """
- try:
- results = {}
- min_size, max_size = size_range
-
- for chunk_size in range(min_size, max_size + 1, step_size):
- # Test this chunk size
- chunker = VectorChunker(self.embeddings, chunk_size=chunk_size)
-
- try:
- chunked_docs = chunker.chunk_documents(documents)
- vector_store = chunker.create_vector_store(chunked_docs)
-
- # Test retrieval performance
- retrieval_scores = []
- for query in test_queries:
- search_results = vector_store.similarity_search_with_score(query, k=3)
- if search_results:
- avg_score = sum(score for _, score in search_results) / len(search_results)
- retrieval_scores.append(float(avg_score))
-
- avg_performance = np.mean(retrieval_scores) if retrieval_scores else 0
-
- results[chunk_size] = {
- 'average_retrieval_score': avg_performance,
- 'total_chunks': len(chunked_docs),
- 'retrieval_scores': retrieval_scores
- }
-
- except Exception as e:
- results[chunk_size] = {'error': str(e)}
-
- # Find optimal chunk size
- valid_results = {k: v for k, v in results.items() if 'error' not in v}
-
- if valid_results:
- # Lower average distance = better retrieval (FAISS returns L2 distances)
- optimal_size = min(valid_results.keys(),
- key=lambda k: valid_results[k]['average_retrieval_score'])
-
- return {
- 'optimal_chunk_size': optimal_size,
- 'optimal_performance': valid_results[optimal_size]['average_retrieval_score'],
- 'all_results': results,
- 'performance_trend': self._analyze_performance_trend(valid_results),
- 'recommendation': f"Use chunk size {optimal_size} for best retrieval performance"
- }
- else:
- return {
- 'error': 'No valid chunk sizes could be tested',
- 'all_results': results
- }
-
- except Exception as e:
- return {'error': f"Chunk size optimization failed: {str(e)}"}
-
- def _analyze_performance_trend(self, results: Dict[int, Dict[str, Any]]) -> Dict[str, Any]:
- """Analyze performance trend across different chunk sizes"""
- try:
- sizes = sorted(results.keys())
- performances = [results[size]['average_retrieval_score'] for size in sizes]
-
- # Find trend direction
- if len(performances) >= 2:
- trend_direction = "increasing" if performances[-1] > performances[0] else "decreasing"
- # Scores are distances, so the best ("peak") configuration has the smallest value
- peak_performance = min(performances)
- peak_size = sizes[performances.index(peak_performance)]
-
- return {
- 'trend_direction': trend_direction,
- 'peak_performance': peak_performance,
- 'peak_size': peak_size,
- 'performance_range': max(performances) - min(performances),
- 'stable_performance': max(performances) - min(performances) < 0.1
- }
- else:
- return {'error': 'Insufficient data for trend analysis'}
-
- except Exception:
- return {'error': 'Trend analysis failed'}
-
-
- class RAGPipeline:
- """Complete RAG pipeline for document question-answering"""
-
- def __init__(self, embeddings_model, llm):
- self.embeddings = embeddings_model
- self.llm = llm
- self.chunker = VectorChunker(embeddings_model)
- self.vector_stores = {}
- self.qa_chains = {}
-
- def create_pipeline(self, documents: List[Document], pipeline_id: str,
- chunking_strategy: str = "semantic") -> Dict[str, Any]:
- """
- Create a complete RAG pipeline for documents
-
- Args:
- documents (List[Document]): Documents to process
- pipeline_id (str): Unique identifier for this pipeline
- chunking_strategy (str): Strategy for document chunking
-
- Returns:
- Dict: Pipeline creation results
- """
- try:
- # Step 1: Chunk documents
- chunked_docs = self.chunker.chunk_documents(documents, strategy=chunking_strategy)
-
- # Step 2: Create vector store
- vector_store = self.chunker.create_vector_store(chunked_docs, store_type="faiss")
-
- # Step 3: Create QA chain
- qa_chain = self.chunker.create_qa_chain(documents, self.llm)
-
- # Store pipeline components
- self.vector_stores[pipeline_id] = vector_store
- self.qa_chains[pipeline_id] = qa_chain
-
- # Pipeline statistics
- stats = {
- 'pipeline_id': pipeline_id,
- 'documents_processed': len(documents),
- 'chunks_created': len(chunked_docs),
- 'chunking_strategy': chunking_strategy,
- 'vector_store_type': 'faiss',
- 'embedding_model': str(self.embeddings),
- 'created_at': self._get_timestamp()
- }
-
- return {
- 'success': True,
- 'pipeline_stats': stats,
- 'chunking_info': self.chunker.get_chunking_stats(documents, chunking_strategy)
- }
-
- except Exception as e:
- return {'error': f"Pipeline creation failed: {str(e)}"}
-
- def query_pipeline(self, pipeline_id: str, query: str,
- return_sources: bool = True) -> Dict[str, Any]:
- """
- Query a created RAG pipeline
-
- Args:
- pipeline_id (str): ID of the pipeline to query
- query (str): Question to ask
- return_sources (bool): Whether to return source documents
-
- Returns:
- Dict: Query results with answer and sources
- """
- try:
- if pipeline_id not in self.qa_chains:
- return {'error': f"Pipeline '{pipeline_id}' not found"}
-
- qa_chain = self.qa_chains[pipeline_id]
-
- # Execute query
- result = qa_chain({"query": query})
-
- # Format response
- response = {
- 'query': query,
- 'answer': result.get('result', 'No answer generated'),
- 'pipeline_id': pipeline_id,
- 'query_timestamp': self._get_timestamp()
- }
-
- # Add source documents if requested
- if return_sources and 'source_documents' in result:
- sources = []
- for i, doc in enumerate(result['source_documents']):
- source = {
- 'source_index': i,
- 'content': doc.page_content,
- 'metadata': doc.metadata,
- 'relevance_rank': i + 1
- }
- sources.append(source)
-
- response['sources'] = sources
- response['num_sources'] = len(sources)
-
- return response
-
- except Exception as e:
- return {'error': f"Pipeline query failed: {str(e)}"}
-
- def batch_query_pipeline(self, pipeline_id: str, queries: List[str]) -> List[Dict[str, Any]]:
- """
- Execute multiple queries on a pipeline
-
- Args:
- pipeline_id (str): ID of the pipeline to query
- queries (List[str]): List of questions to ask
-
- Returns:
- List[Dict]: List of query results
- """
- results = []
-
- for i, query in enumerate(queries):
- try:
- result = self.query_pipeline(pipeline_id, query, return_sources=False)
- result['batch_index'] = i
- results.append(result)
-
- except Exception as e:
- results.append({
- 'batch_index': i,
- 'query': query,
- 'error': f"Batch query failed: {str(e)}"
- })
-
- return results
-
- def evaluate_pipeline(self, pipeline_id: str, test_queries: List[str],
- expected_answers: List[str] = None) -> Dict[str, Any]:
- """
- Evaluate pipeline performance on test queries
-
- Args:
- pipeline_id (str): ID of the pipeline to evaluate
- test_queries (List[str]): Test questions
- expected_answers (List[str]): Optional expected answers for comparison
-
- Returns:
- Dict: Evaluation results
- """
- try:
- if pipeline_id not in self.qa_chains:
- return {'error': f"Pipeline '{pipeline_id}' not found"}
-
- evaluation_results = []
- response_times = []
-
- import time # hoisted above the loop; re-importing on every iteration was redundant
-
- for i, query in enumerate(test_queries):
- start_time = time.time()
-
- # Execute query
- result = self.query_pipeline(pipeline_id, query, return_sources=True)
-
- end_time = time.time()
- response_time = end_time - start_time
- response_times.append(response_time)
-
- # Evaluate result
- eval_result = {
- 'query_index': i,
- 'query': query,
- 'answer_generated': not result.get('error'),
- 'response_time': response_time,
- 'answer_length': len(result.get('answer', '')),
- 'sources_returned': result.get('num_sources', 0)
- }
-
- # If expected answer provided, calculate similarity
- if expected_answers and i < len(expected_answers):
- expected = expected_answers[i]
- generated = result.get('answer', '')
-
- # Simple similarity metric
- similarity = self._calculate_answer_similarity(expected, generated)
- eval_result['answer_similarity'] = similarity
- eval_result['expected_answer'] = expected
-
- evaluation_results.append(eval_result)
-
- # Calculate aggregate metrics
- successful_queries = len([r for r in evaluation_results if r['answer_generated']])
- avg_response_time = np.mean(response_times) if response_times else 0
-
- if expected_answers:
- similarities = [r.get('answer_similarity', 0) for r in evaluation_results
- if 'answer_similarity' in r]
- avg_similarity = np.mean(similarities) if similarities else 0
- else:
- avg_similarity = None
-
- return {
- 'pipeline_id': pipeline_id,
- 'total_queries': len(test_queries),
- 'successful_queries': successful_queries,
- 'success_rate': successful_queries / len(test_queries) if test_queries else 0,
- 'average_response_time': avg_response_time,
- 'average_answer_similarity': avg_similarity,
- 'detailed_results': evaluation_results,
- 'evaluation_timestamp': self._get_timestamp()
- }
-
- except Exception as e:
- return {'error': f"Pipeline evaluation failed: {str(e)}"}
-
- def _calculate_answer_similarity(self, expected: str, generated: str) -> float:
- """Calculate similarity between expected and generated answers"""
- try:
- # Simple word overlap similarity
- expected_words = set(expected.lower().split())
- generated_words = set(generated.lower().split())
-
- if not expected_words and not generated_words:
- return 1.0
-
- intersection = expected_words.intersection(generated_words)
- union = expected_words.union(generated_words)
-
- return len(intersection) / len(union) if union else 0.0
-
- except Exception:
- return 0.0
-
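Editor's note: the word-overlap metric above is a Jaccard index over lowercased word sets; worked numbers:

expected = set("the cat sat".lower().split())    # {'the', 'cat', 'sat'}
generated = set("the cat ran".lower().split())   # {'the', 'cat', 'ran'}
print(len(expected & generated) / len(expected | generated))  # 2 shared / 4 total = 0.5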
- def get_pipeline_info(self, pipeline_id: str) -> Dict[str, Any]:
- """Get information about a specific pipeline"""
- try:
- if pipeline_id not in self.qa_chains:
- return {'error': f"Pipeline '{pipeline_id}' not found"}
-
- # Get vector store info
- vector_store = self.vector_stores.get(pipeline_id)
- if vector_store:
- try:
- # Try to get vector store statistics
- total_vectors = vector_store.index.ntotal if hasattr(vector_store, 'index') else 'unknown'
- except Exception: # a bare except would also swallow KeyboardInterrupt/SystemExit
- total_vectors = 'unknown'
- else:
- total_vectors = 'unknown'
-
- return {
- 'pipeline_id': pipeline_id,
- 'has_qa_chain': pipeline_id in self.qa_chains,
- 'has_vector_store': pipeline_id in self.vector_stores,
- 'total_vectors': total_vectors,
- 'embedding_model': str(self.embeddings),
- 'llm_model': str(self.llm)
- }
-
- except Exception as e:
- return {'error': f"Failed to get pipeline info: {str(e)}"}
-
- def list_pipelines(self) -> Dict[str, Any]:
- """List all created pipelines"""
- return {
- 'total_pipelines': len(self.qa_chains),
- 'pipeline_ids': list(self.qa_chains.keys()),
- 'vector_stores': list(self.vector_stores.keys())
- }
-
- def delete_pipeline(self, pipeline_id: str) -> Dict[str, Any]:
- """Delete a pipeline and free resources"""
- try:
- deleted_components = []
-
- if pipeline_id in self.qa_chains:
- del self.qa_chains[pipeline_id]
- deleted_components.append('qa_chain')
-
- if pipeline_id in self.vector_stores:
- del self.vector_stores[pipeline_id]
- deleted_components.append('vector_store')
-
- if deleted_components:
- return {
- 'success': True,
- 'pipeline_id': pipeline_id,
- 'deleted_components': deleted_components
- }
- else:
- return {'error': f"Pipeline '{pipeline_id}' not found"}
-
- except Exception as e:
- return {'error': f"Pipeline deletion failed: {str(e)}"}
-
- def export_pipeline_config(self, pipeline_id: str) -> Dict[str, Any]:
- """Export pipeline configuration for recreation"""
- try:
- if pipeline_id not in self.qa_chains:
- return {'error': f"Pipeline '{pipeline_id}' not found"}
-
- config = {
- 'pipeline_id': pipeline_id,
- 'embedding_model_name': getattr(self.embeddings, 'model_name', 'unknown'),
- 'llm_model_name': getattr(self.llm, 'model_name', 'unknown'),
- 'chunker_config': {
- 'chunk_size': self.chunker.chunk_size,
- 'chunk_overlap': self.chunker.chunk_overlap
- },
- 'export_timestamp': self._get_timestamp(),
- 'vector_store_type': 'faiss'
- }
-
- return config
-
- except Exception as e:
- return {'error': f"Pipeline export failed: {str(e)}"}
-
- def _get_timestamp(self) -> str:
- """Get current timestamp"""
- from datetime import datetime
- return datetime.now().strftime('%Y-%m-%d %H:%M:%S')
-
-
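Editor's note: pulling the class together, an end-to-end sketch (the Groq key comes from the environment as elsewhere in this app; the sample document is invented):

import os
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from utils.chunker import RAGPipeline

emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
llm = ChatGroq(api_key=os.getenv("GROQ_API_KEY"), model_name="llama-3.1-8b-instant")
rag = RAGPipeline(emb, llm)
rag.create_pipeline([Document(page_content="GEO is AI-first SEO.")], "demo")
print(rag.query_pipeline("demo", "What is GEO?").get('answer'))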
- # Utility functions for the module
-
- def optimize_rag_pipeline(documents: List[Document], embeddings_model, llm,
- test_queries: List[str]) -> Dict[str, Any]:
- """
- Optimize RAG pipeline configuration for given documents and queries
-
- Args:
- documents (List[Document]): Documents to optimize for
- embeddings_model: Embedding model to use
- llm: Language model to use
- test_queries (List[str]): Test queries for optimization
-
- Returns:
- Dict: Optimization recommendations
- """
- try:
- # Test different chunking strategies
- chunker = VectorChunker(embeddings_model)
- chunking_results = chunker.optimize_chunking_strategy(documents, test_queries)
-
- # Test different chunk sizes
- optimizer = ChunkingOptimizer(embeddings_model)
- size_results = optimizer.optimize_chunk_size(documents, test_queries)
-
- # Create optimized pipeline
- best_strategy = chunking_results.get('recommended_strategy', 'semantic')
- best_size = size_results.get('optimal_chunk_size', 1000)
-
- # Create optimized chunker
- optimized_chunker = VectorChunker(
- embeddings_model,
- chunk_size=best_size,
- chunk_overlap=best_size // 5 # 20% overlap
- )
-
- # Test the optimized configuration
- pipeline = RAGPipeline(embeddings_model, llm)
- pipeline.chunker = optimized_chunker
-
- test_pipeline_id = "optimization_test"
- creation_result = pipeline.create_pipeline(documents, test_pipeline_id, best_strategy)
-
- if not creation_result.get('error'):
- evaluation_result = pipeline.evaluate_pipeline(test_pipeline_id, test_queries)
- pipeline.delete_pipeline(test_pipeline_id) # Clean up
- else:
- evaluation_result = {'error': 'Could not evaluate optimized pipeline'}
-
- return {
- 'optimization_complete': True,
- 'recommended_config': {
- 'chunking_strategy': best_strategy,
- 'chunk_size': best_size,
- 'chunk_overlap': best_size // 5
- },
- 'chunking_optimization': chunking_results,
- 'size_optimization': size_results,
- 'performance_evaluation': evaluation_result,
- 'recommendations': [
- f"Use {best_strategy} chunking strategy",
- f"Set chunk size to {best_size} characters",
- f"Use {best_size // 5} character overlap",
- "Monitor and adjust based on query performance"
- ]
- }
-
- except Exception as e:
- return {'error': f"RAG optimization failed: {str(e)}"}
-
-
- def create_demo_rag_system(sample_documents: List[Document], embeddings_model, llm) -> Dict[str, Any]:
- """
- Create a demonstration RAG system with sample documents
-
- Args:
- sample_documents (List[Document]): Sample documents for demo
- embeddings_model: Embedding model
- llm: Language model
-
- Returns:
- Dict: Demo system information and sample interactions
- """
- try:
- # Create RAG pipeline
- pipeline = RAGPipeline(embeddings_model, llm)
- demo_id = "demo_system"
-
- # Create the pipeline
- creation_result = pipeline.create_pipeline(sample_documents, demo_id, "semantic")
-
- if creation_result.get('error'):
- return {'error': f"Demo system creation failed: {creation_result['error']}"}
-
- # Sample queries for demonstration
- demo_queries = [
- "What is the main topic of these documents?",
- "Can you summarize the key points?",
- "What are the most important concepts mentioned?"
- ]
-
- # Execute demo queries
- demo_results = []
- for query in demo_queries:
- result = pipeline.query_pipeline(demo_id, query, return_sources=True)
- demo_results.append(result)
-
- # Get system statistics
- pipeline_info = pipeline.get_pipeline_info(demo_id)
-
- return {
- 'demo_system_created': True,
- 'pipeline_id': demo_id,
- 'creation_stats': creation_result,
- 'pipeline_info': pipeline_info,
- 'demo_queries': demo_queries,
- 'demo_results': demo_results,
- 'usage_instructions': [
- f"Use pipeline.query_pipeline('{demo_id}', 'your question') to ask questions",
- "The system will return answers with source document references",
- "Sources show which parts of the documents were used for the answer"
- ]
- }
-
- except Exception as e:
- return {'error': f"Demo system creation failed: {str(e)}"}
-
-
- # Export the main classes for use in other modules
- __all__ = [
- 'VectorChunker',
- 'ChunkingOptimizer',
- 'RAGPipeline',
- 'optimize_rag_pipeline',
- 'create_demo_rag_system'
- ]
utils/export.py DELETED
@@ -1,1896 +0,0 @@
- """
- Results Export and Reporting Module
- Handles export of analysis results, reports, and data for external use
- """
-
- import json
- import csv
- import io
- import zipfile
- import tempfile
- import os
- from datetime import datetime
- from typing import Dict, Any, List, Optional, Union
- import pandas as pd
- from dataclasses import dataclass, asdict
-
-
- @dataclass
- class GEOReport:
- """Data class for GEO analysis reports"""
- website_url: str
- analysis_date: str
- overall_score: float
- pages_analyzed: int
- geo_scores: Dict[str, float]
- recommendations: List[str]
- optimization_opportunities: List[Dict[str, Any]]
- competitive_position: str
-
- def to_dict(self) -> Dict[str, Any]:
- """Convert report to dictionary"""
- return asdict(self)
-
-
- @dataclass
- class ContentAnalysis:
- """Data class for content optimization analysis"""
- original_content: str
- analysis_date: str
- clarity_score: float
- structure_score: float
- answerability_score: float
- keywords: List[str]
- optimized_content: Optional[str]
- improvements_made: List[str]
-
- def to_dict(self) -> Dict[str, Any]:
- """Convert analysis to dictionary"""
- return asdict(self)
-
-
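Editor's note: since both dataclasses round-trip through asdict(), building and serializing a report is one step; a sketch with invented field values:

from datetime import datetime
from utils.export import GEOReport

report = GEOReport(
    website_url="https://example.com",
    analysis_date=datetime.now().isoformat(),
    overall_score=7.4,
    pages_analyzed=12,
    geo_scores={"clarity": 7.1, "structuredness": 7.8},
    recommendations=["Add an FAQ section"],
    optimization_opportunities=[],
    competitive_position="above average",
)
print(report.to_dict()["overall_score"])  # asdict() converts the dataclass to a plain dict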
- class ResultExporter:
- """Main class for exporting analysis results and generating reports"""
-
- def __init__(self):
- self.export_formats = ['json', 'csv', 'html', 'pdf', 'xlsx']
- self.supported_types = ['geo_analysis', 'content_optimization', 'qa_results', 'batch_analysis']
-
- def export_geo_results(self, geo_results: List[Dict[str, Any]],
- website_url: str, format_type: str = 'json') -> Union[str, bytes, Dict[str, Any]]:
- """
- Export GEO analysis results in specified format
-
- Args:
- geo_results (List[Dict]): List of GEO analysis results
- website_url (str): URL of analyzed website
- format_type (str): Export format ('json', 'csv', 'html', 'xlsx')
-
- Returns:
- Union[str, bytes, Dict]: Exported data in requested format
- """
- try:
- # Prepare consolidated data
- export_data = self._prepare_geo_export_data(geo_results, website_url)
-
- if format_type.lower() == 'json':
- return self._export_geo_json(export_data)
- elif format_type.lower() == 'csv':
- return self._export_geo_csv(export_data)
- elif format_type.lower() == 'html':
- return self._export_geo_html(export_data)
- elif format_type.lower() == 'xlsx':
- return self._export_geo_excel(export_data)
- elif format_type.lower() == 'pdf':
- return self._export_geo_pdf(export_data)
- else:
- raise ValueError(f"Unsupported export format: {format_type}")
-
- except Exception as e:
- return {'error': f"Export failed: {str(e)}"}
-
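Editor's note: dispatch is purely on the format string, so callers choose the output type at the call site; a hedged sketch (the result shape here is a minimal stand-in for real scorer output):

from utils.export import ResultExporter

exporter = ResultExporter()
geo_results = [{"geo_scores": {"clarity": 7.0}, "recommendations": ["Use headings"]}]
html_report = exporter.export_geo_results(geo_results, "https://example.com", format_type="html")
with open("geo_report.html", "w", encoding="utf-8") as f:
    f.write(html_report)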
- def export_enhancement_results(self, enhancement_result: Dict[str, Any],
- format_type: str = 'json') -> Union[str, bytes, Dict[str, Any]]:
- """
- Export content enhancement results
-
- Args:
- enhancement_result (Dict): Content enhancement analysis result
- format_type (str): Export format
-
- Returns:
- Union[str, bytes, Dict]: Exported data
- """
- try:
- # Prepare data for export
- export_data = self._prepare_enhancement_export_data(enhancement_result)
-
- if format_type.lower() == 'json':
- return json.dumps(export_data, indent=2, ensure_ascii=False)
- elif format_type.lower() == 'html':
- return self._export_enhancement_html(export_data)
- elif format_type.lower() == 'csv':
- return self._export_enhancement_csv(export_data)
- else:
- return json.dumps(export_data, indent=2, ensure_ascii=False)
-
- except Exception as e:
- return {'error': f"Enhancement export failed: {str(e)}"}
-
- def export_qa_results(self, qa_results: List[Dict[str, Any]],
- format_type: str = 'json') -> Union[str, bytes, Dict[str, Any]]:
- """
- Export Q&A session results
-
- Args:
- qa_results (List[Dict]): List of Q&A interactions
- format_type (str): Export format
-
- Returns:
- Union[str, bytes, Dict]: Exported data
- """
- try:
- export_data = {
- 'qa_session': {
- 'session_date': datetime.now().isoformat(),
- 'total_questions': len(qa_results),
- 'interactions': qa_results
- },
- 'summary': {
- 'successful_answers': len([r for r in qa_results if not r.get('error')]),
- 'average_response_length': self._calculate_avg_response_length(qa_results),
- 'most_common_topics': self._extract_common_topics(qa_results)
- }
- }
-
- if format_type.lower() == 'json':
- return json.dumps(export_data, indent=2, ensure_ascii=False)
- elif format_type.lower() == 'html':
- return self._export_qa_html(export_data)
- elif format_type.lower() == 'csv':
- return self._export_qa_csv(export_data)
- else:
- return json.dumps(export_data, indent=2, ensure_ascii=False)
-
- except Exception as e:
- return {'error': f"Q&A export failed: {str(e)}"}
-
- def create_comprehensive_report(self, analysis_data: Dict[str, Any],
- report_type: str = 'full') -> Dict[str, Any]:
- """
- Create comprehensive analysis report
-
- Args:
- analysis_data (Dict): Combined analysis data from multiple sources
- report_type (str): Type of report ('full', 'summary', 'executive')
-
- Returns:
- Dict: Comprehensive report data
- """
- try:
- report = {
- 'report_metadata': {
- 'generated_at': datetime.now().isoformat(),
- 'report_type': report_type,
- 'generator': 'GEO SEO AI Optimizer',
- 'version': '1.0'
- }
- }
-
- if report_type == 'executive':
- report.update(self._create_executive_summary(analysis_data))
- elif report_type == 'summary':
- report.update(self._create_summary_report(analysis_data))
- else: # full report
- report.update(self._create_full_report(analysis_data))
-
- return report
-
- except Exception as e:
- return {'error': f"Report creation failed: {str(e)}"}
-
- def export_batch_results(self, batch_results: List[Dict[str, Any]],
- batch_metadata: Dict[str, Any],
- format_type: str = 'xlsx') -> Union[str, bytes, Dict[str, Any]]:
- """
- Export batch analysis results
-
- Args:
- batch_results (List[Dict]): List of batch analysis results
- batch_metadata (Dict): Metadata about the batch process
- format_type (str): Export format
-
- Returns:
- Union[str, bytes, Dict]: Exported batch data
- """
- try:
- export_data = {
- 'batch_metadata': batch_metadata,
- 'batch_results': batch_results,
- 'batch_summary': self._create_batch_summary(batch_results),
- 'export_timestamp': datetime.now().isoformat()
- }
-
- if format_type.lower() == 'xlsx':
- return self._export_batch_excel(export_data)
- elif format_type.lower() == 'json':
- return json.dumps(export_data, indent=2, ensure_ascii=False)
- elif format_type.lower() == 'csv':
- return self._export_batch_csv(export_data)
- else:
- return json.dumps(export_data, indent=2, ensure_ascii=False)
-
- except Exception as e:
- return {'error': f"Batch export failed: {str(e)}"}
-
- def create_export_package(self, analysis_data: Dict[str, Any],
- package_name: str = "geo_analysis") -> bytes:
- """
- Create a ZIP package with multiple export formats
-
- Args:
- analysis_data (Dict): Analysis data to package
- package_name (str): Name for the package
-
- Returns:
- bytes: ZIP file content
- """
- try:
- # Create temporary directory
- with tempfile.TemporaryDirectory() as temp_dir:
- zip_path = os.path.join(temp_dir, f"{package_name}.zip")
-
- with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zip_file:
- # Add JSON export
- json_data = json.dumps(analysis_data, indent=2, ensure_ascii=False)
- zip_file.writestr(f"{package_name}.json", json_data)
-
- # Add HTML report
- if 'geo_results' in analysis_data:
- html_data = self._export_geo_html(analysis_data)
- zip_file.writestr(f"{package_name}_report.html", html_data)
-
- # Add CSV data
- if 'geo_results' in analysis_data:
- csv_data = self._export_geo_csv(analysis_data)
- zip_file.writestr(f"{package_name}_data.csv", csv_data)
-
- # Add README
- readme_content = self._generate_package_readme(analysis_data)
- zip_file.writestr("README.txt", readme_content)
-
- # Read the ZIP file
- with open(zip_path, 'rb') as packaged: # new name; zip_file above is the ZipFile handle
- return packaged.read()
-
- except Exception as e:
- raise Exception(f"Package creation failed: {str(e)}")
-
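Editor's note: because the method returns raw bytes, the ZIP can be written straight to disk or handed to a download button; a minimal sketch (package name and payload invented):

from utils.export import ResultExporter

exporter = ResultExporter()
blob = exporter.create_export_package({"geo_results": []}, package_name="site_audit")
with open("site_audit.zip", "wb") as f:
    f.write(blob)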
- def _prepare_geo_export_data(self, geo_results: List[Dict[str, Any]], website_url: str) -> Dict[str, Any]:
- """Prepare GEO data for export"""
- try:
- # Calculate aggregate metrics
- valid_results = [r for r in geo_results if 'geo_scores' in r and not r.get('error')]
-
- if not valid_results:
- return {
- 'error': 'No valid GEO results to export',
- 'website_url': website_url,
- 'export_timestamp': datetime.now().isoformat()
- }
-
- # Aggregate scores
- all_scores = {}
- for result in valid_results:
- for metric, score in result.get('geo_scores', {}).items():
- if metric not in all_scores:
- all_scores[metric] = []
- all_scores[metric].append(score)
-
- avg_scores = {metric: sum(scores) / len(scores) for metric, scores in all_scores.items()}
- overall_avg = sum(avg_scores.values()) / len(avg_scores) if avg_scores else 0
-
- # Collect recommendations
- all_recommendations = []
- all_opportunities = []
-
- for result in valid_results:
- all_recommendations.extend(result.get('recommendations', []))
- all_opportunities.extend(result.get('optimization_opportunities', []))
-
- # Remove duplicates
- unique_recommendations = list(set(all_recommendations))
-
- return {
- 'website_analysis': {
- 'url': website_url,
- 'analysis_date': datetime.now().isoformat(),
- 'pages_analyzed': len(valid_results),
- 'overall_geo_score': round(overall_avg, 2)
- },
- 'aggregate_scores': avg_scores,
- 'individual_page_results': valid_results,
- 'recommendations': unique_recommendations[:10], # Top 10
- 'optimization_opportunities': all_opportunities,
- 'performance_insights': self._generate_performance_insights(avg_scores, overall_avg),
- 'export_metadata': {
- 'exported_by': 'GEO SEO AI Optimizer',
- 'export_timestamp': datetime.now().isoformat(),
- 'data_format': 'GEO Analysis Results v1.0'
- }
- }
-
- except Exception as e:
- return {'error': f"Data preparation failed: {str(e)}"}
-
- def _prepare_enhancement_export_data(self, enhancement_result: Dict[str, Any]) -> Dict[str, Any]:
- """Prepare content enhancement data for export"""
- try:
- scores = enhancement_result.get('scores', {})
-
- return {
- 'content_analysis': {
- 'analysis_date': datetime.now().isoformat(),
- 'original_content_length': enhancement_result.get('original_length', 0),
- 'original_word_count': enhancement_result.get('original_word_count', 0),
- 'analysis_type': enhancement_result.get('optimization_type', 'standard')
- },
- 'performance_scores': {
- 'clarity': scores.get('clarity', 0),
- 'structure': scores.get('structuredness', 0),
- 'answerability': scores.get('answerability', 0),
- 'overall_average': sum(scores.values()) / len(scores) if scores else 0
- },
- 'optimization_results': {
- 'keywords_identified': enhancement_result.get('keywords', []),
- 'optimized_content': enhancement_result.get('optimized_text', ''),
- 'improvements_made': enhancement_result.get('optimization_suggestions', []),
- 'analyze_only': enhancement_result.get('analyze_only', False)
- },
- 'export_metadata': {
- 'exported_by': 'GEO SEO AI Optimizer',
- 'export_timestamp': datetime.now().isoformat(),
- 'data_format': 'Content Enhancement Results v1.0'
- }
- }
-
- except Exception as e:
- return {'error': f"Enhancement data preparation failed: {str(e)}"}
-
- def _export_geo_json(self, data: Dict[str, Any]) -> str:
- """Export GEO data as JSON"""
- return json.dumps(data, indent=2, ensure_ascii=False)
-
- def _export_geo_csv(self, data: Dict[str, Any]) -> str:
- """Export GEO data as CSV"""
- try:
- output = io.StringIO()
-
- # Write report header
- writer = csv.writer(output)
- writer.writerow(['GEO Analysis Results'])
- writer.writerow(['Website:', data.get('website_analysis', {}).get('url', 'Unknown')])
- writer.writerow(['Analysis Date:', data.get('website_analysis', {}).get('analysis_date', 'Unknown')])
- writer.writerow(['Overall Score:', data.get('website_analysis', {}).get('overall_geo_score', 0)])
- writer.writerow([])
-
- # Write aggregate scores
- writer.writerow(['Metric', 'Score'])
- for metric, score in data.get('aggregate_scores', {}).items():
- writer.writerow([metric.replace('_', ' ').title(), round(score, 2)])
-
- writer.writerow([])
- writer.writerow(['Recommendations'])
- for i, rec in enumerate(data.get('recommendations', []), 1):
- writer.writerow([f"{i}.", rec])
-
- # Individual page results
- if data.get('individual_page_results'):
- writer.writerow([])
- writer.writerow(['Individual Page Results'])
-
- # Header for page results (built unconditionally so the row loop below cannot hit a NameError)
- first_result = data['individual_page_results'][0]
- headers = ['Page Index', 'Page URL', 'Page Title'] + list(first_result.get('geo_scores', {}).keys())
- writer.writerow(headers)
-
398
- for i, result in enumerate(data['individual_page_results']):
399
- page_data = result.get('page_data', {})
400
- scores = result.get('geo_scores', {})
401
-
402
- row = [
403
- i + 1,
404
- page_data.get('url', 'Unknown'),
405
- page_data.get('title', 'Unknown')
406
- ] + [round(scores.get(metric, 0), 2) for metric in headers[3:]]
407
-
408
- writer.writerow(row)
409
-
410
- return output.getvalue()
411
-
412
- except Exception as e:
413
- return f"CSV export error: {str(e)}"
414
-
415
- def _export_geo_html(self, data: Dict[str, Any]) -> str:
416
- """Export GEO data as HTML report"""
417
- try:
418
- website_info = data.get('website_analysis', {})
419
- scores = data.get('aggregate_scores', {})
420
- recommendations = data.get('recommendations', [])
421
-
422
- html_content = f"""
423
- <!DOCTYPE html>
424
- <html lang="en">
425
- <head>
426
- <meta charset="UTF-8">
427
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
428
- <title>GEO Analysis Report - {website_info.get('url', 'Website')}</title>
429
- <style>
430
- body {{
431
- font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
432
- line-height: 1.6;
433
- color: #333;
434
- max-width: 1200px;
435
- margin: 0 auto;
436
- padding: 20px;
437
- background-color: #f5f5f5;
438
- }}
439
- .header {{
440
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
441
- color: white;
442
- padding: 30px;
443
- border-radius: 10px;
444
- margin-bottom: 30px;
445
- text-align: center;
446
- }}
447
- .header h1 {{
448
- margin: 0;
449
- font-size: 2.5em;
450
- }}
451
- .summary-cards {{
452
- display: grid;
453
- grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
454
- gap: 20px;
455
- margin-bottom: 30px;
456
- }}
457
- .card {{
458
- background: white;
459
- padding: 20px;
460
- border-radius: 10px;
461
- box-shadow: 0 4px 6px rgba(0,0,0,0.1);
462
- text-align: center;
463
- }}
464
- .card h3 {{
465
- margin-top: 0;
466
- color: #667eea;
467
- }}
468
- .score {{
469
- font-size: 2em;
470
- font-weight: bold;
471
- color: #333;
472
- }}
473
- .scores-grid {{
474
- display: grid;
475
- grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
476
- gap: 20px;
477
- margin-bottom: 30px;
478
- }}
479
- .score-item {{
480
- background: white;
481
- padding: 15px;
482
- border-radius: 8px;
483
- box-shadow: 0 2px 4px rgba(0,0,0,0.1);
484
- display: flex;
485
- justify-content: space-between;
486
- align-items: center;
487
- }}
488
- .score-bar {{
489
- width: 100px;
490
- height: 10px;
491
- background: #e0e0e0;
492
- border-radius: 5px;
493
- overflow: hidden;
494
- }}
495
- .score-fill {{
496
- height: 100%;
497
- background: linear-gradient(90deg, #ff6b6b, #ffa500, #4ecdc4);
498
- transition: width 0.3s ease;
499
- }}
500
- .recommendations {{
501
- background: white;
502
- padding: 30px;
503
- border-radius: 10px;
504
- box-shadow: 0 4px 6px rgba(0,0,0,0.1);
505
- margin-bottom: 30px;
506
- }}
507
- .recommendations h2 {{
508
- color: #667eea;
509
- border-bottom: 2px solid #667eea;
510
- padding-bottom: 10px;
511
- }}
512
- .rec-item {{
513
- padding: 10px 0;
514
- border-bottom: 1px solid #eee;
515
- }}
516
- .footer {{
517
- text-align: center;
518
- color: #666;
519
- margin-top: 40px;
520
- padding-top: 20px;
521
- border-top: 1px solid #ddd;
522
- }}
523
- </style>
524
- </head>
525
- <body>
526
- <div class="header">
527
- <h1>🚀 GEO Analysis Report</h1>
528
- <p>Generative Engine Optimization Performance Analysis</p>
529
- <p><strong>Website:</strong> {website_info.get('url', 'Not specified')}</p>
530
- <p><strong>Analysis Date:</strong> {website_info.get('analysis_date', 'Not specified')}</p>
531
- </div>
532
-
533
- <div class="summary-cards">
534
- <div class="card">
535
- <h3>Overall GEO Score</h3>
536
- <div class="score">{website_info.get('overall_geo_score', 0)}/10</div>
537
- </div>
538
- <div class="card">
539
- <h3>Pages Analyzed</h3>
540
- <div class="score">{website_info.get('pages_analyzed', 0)}</div>
541
- </div>
542
- <div class="card">
543
- <h3>Recommendations</h3>
544
- <div class="score">{len(recommendations)}</div>
545
- </div>
546
- </div>
547
-
548
- <h2>📊 Detailed GEO Metrics</h2>
549
- <div class="scores-grid">
550
- """
551
-
552
- # Add individual scores
553
- for metric, score in scores.items():
554
- metric_display = metric.replace('_', ' ').title()
555
- score_percentage = min(score * 10, 100) # Convert to percentage
556
-
557
- html_content += f"""
558
- <div class="score-item">
559
- <div>
560
- <strong>{metric_display}</strong><br>
561
- <span style="color: #666;">{score:.1f}/10</span>
562
- </div>
563
- <div class="score-bar">
564
- <div class="score-fill" style="width: {score_percentage}%;"></div>
565
- </div>
566
- </div>
567
- """
568
-
569
- html_content += """
570
- </div>
571
-
572
- <div class="recommendations">
573
- <h2>💡 Optimization Recommendations</h2>
574
- """
575
-
576
- # Add recommendations
577
- for i, rec in enumerate(recommendations, 1):
578
- html_content += f'<div class="rec-item"><strong>{i}.</strong> {rec}</div>'
579
-
580
- html_content += f"""
581
- </div>
582
-
583
- <div class="footer">
584
- <p>Generated by GEO SEO AI Optimizer | {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
585
- <p>This report provides AI-first SEO optimization insights for better generative engine performance.</p>
586
- </div>
587
- </body>
588
- </html>
589
- """
590
-
591
- return html_content
592
-
593
- except Exception as e:
594
- return f"<html><body><h1>HTML Export Error</h1><p>{str(e)}</p></body></html>"
595
-
596
-     def _export_geo_excel(self, data: Dict[str, Any]) -> bytes:
-         """Export GEO data as Excel file"""
-         try:
-             output = io.BytesIO()
- 
-             with pd.ExcelWriter(output, engine='openpyxl') as writer:
-                 # Summary sheet
-                 summary_data = {
-                     'Metric': ['Website URL', 'Analysis Date', 'Pages Analyzed', 'Overall Score'],
-                     'Value': [
-                         data.get('website_analysis', {}).get('url', 'Unknown'),
-                         data.get('website_analysis', {}).get('analysis_date', 'Unknown'),
-                         data.get('website_analysis', {}).get('pages_analyzed', 0),
-                         data.get('website_analysis', {}).get('overall_geo_score', 0)
-                     ]
-                 }
-                 pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary', index=False)
- 
-                 # Scores sheet
-                 scores_data = []
-                 for metric, score in data.get('aggregate_scores', {}).items():
-                     scores_data.append({
-                         'Metric': metric.replace('_', ' ').title(),
-                         'Score': round(score, 2),
-                         'Performance': self._get_performance_level(score)
-                     })
- 
-                 pd.DataFrame(scores_data).to_excel(writer, sheet_name='GEO Scores', index=False)
- 
-                 # Recommendations sheet
-                 rec_data = []
-                 for i, rec in enumerate(data.get('recommendations', []), 1):
-                     rec_data.append({
-                         'Priority': i,
-                         'Recommendation': rec,
-                         'Category': self._categorize_recommendation(rec)
-                     })
- 
-                 if rec_data:
-                     pd.DataFrame(rec_data).to_excel(writer, sheet_name='Recommendations', index=False)
- 
-                 # Individual pages sheet
-                 if data.get('individual_page_results'):
-                     pages_data = []
-                     for i, result in enumerate(data['individual_page_results']):
-                         page_data = result.get('page_data', {})
-                         scores = result.get('geo_scores', {})
- 
-                         page_row = {
-                             'Page_Index': i + 1,
-                             'URL': page_data.get('url', 'Unknown'),
-                             'Title': page_data.get('title', 'Unknown'),
-                             'Word_Count': page_data.get('word_count', 0)
-                         }
- 
-                         # Add all GEO scores
-                         for metric, score in scores.items():
-                             page_row[metric.replace('_', ' ').title()] = round(score, 2)
- 
-                         pages_data.append(page_row)
- 
-                     pd.DataFrame(pages_data).to_excel(writer, sheet_name='Individual Pages', index=False)
- 
-             output.seek(0)
-             return output.getvalue()
- 
-         except Exception as e:
-             # Return error as text file if Excel creation fails
-             error_content = f"Excel export failed: {str(e)}\n\nData:\n{json.dumps(data, indent=2)}"
-             return error_content.encode('utf-8')
- 
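A quick sanity check on the workbook bytes returned above is to read them straight back with pandas; a minimal sketch, assuming `openpyxl` is installed (the writer above already requires it) and that `exporter` and `geo_data` are stand-ins for a `ResultExporter` instance and its input dict:

```python
import io

import pandas as pd

# Hypothetical round-trip check on _export_geo_excel's output:
# parse every sheet back out of the returned bytes and confirm
# the fixed sheet names written above are present.
workbook_bytes = exporter._export_geo_excel(geo_data)
sheets = pd.read_excel(io.BytesIO(workbook_bytes), sheet_name=None)  # dict of DataFrames
assert 'Summary' in sheets and 'GEO Scores' in sheets
print({name: frame.shape for name, frame in sheets.items()})
```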
-     def _export_enhancement_html(self, data: Dict[str, Any]) -> str:
-         """Export content enhancement results as HTML"""
-         try:
-             analysis = data.get('content_analysis', {})
-             scores = data.get('performance_scores', {})
-             optimization = data.get('optimization_results', {})
- 
-             html_content = f"""
-         <!DOCTYPE html>
-         <html lang="en">
-         <head>
-             <meta charset="UTF-8">
-             <meta name="viewport" content="width=device-width, initial-scale=1.0">
-             <title>Content Enhancement Report</title>
-             <style>
-                 body {{
-                     font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
-                     line-height: 1.6;
-                     color: #333;
-                     max-width: 1000px;
-                     margin: 0 auto;
-                     padding: 20px;
-                     background-color: #f8f9fa;
-                 }}
-                 .header {{
-                     background: linear-gradient(135deg, #28a745 0%, #20c997 100%);
-                     color: white;
-                     padding: 30px;
-                     border-radius: 10px;
-                     margin-bottom: 30px;
-                     text-align: center;
-                 }}
-                 .scores {{
-                     display: grid;
-                     grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
-                     gap: 20px;
-                     margin-bottom: 30px;
-                 }}
-                 .score-card {{
-                     background: white;
-                     padding: 20px;
-                     border-radius: 10px;
-                     box-shadow: 0 4px 6px rgba(0,0,0,0.1);
-                     text-align: center;
-                 }}
-                 .content-section {{
-                     background: white;
-                     padding: 30px;
-                     border-radius: 10px;
-                     box-shadow: 0 4px 6px rgba(0,0,0,0.1);
-                     margin-bottom: 20px;
-                 }}
-                 .keywords {{
-                     display: flex;
-                     flex-wrap: wrap;
-                     gap: 10px;
-                     margin-top: 15px;
-                 }}
-                 .keyword {{
-                     background: #e9ecef;
-                     padding: 5px 10px;
-                     border-radius: 20px;
-                     font-size: 0.9em;
-                 }}
-                 .optimized-content {{
-                     background: #f8f9fa;
-                     padding: 20px;
-                     border-left: 4px solid #28a745;
-                     border-radius: 5px;
-                     font-style: italic;
-                 }}
-             </style>
-         </head>
-         <body>
-             <div class="header">
-                 <h1>🔧 Content Enhancement Report</h1>
-                 <p>AI-Optimized Content Analysis Results</p>
-                 <p><strong>Analysis Date:</strong> {analysis.get('analysis_date', 'Unknown')}</p>
-             </div>
- 
-             <div class="scores">
-                 <div class="score-card">
-                     <h3>Clarity Score</h3>
-                     <div style="font-size: 2em; font-weight: bold; color: #28a745;">
-                         {scores.get('clarity', 0):.1f}/10
-                     </div>
-                 </div>
-                 <div class="score-card">
-                     <h3>Structure Score</h3>
-                     <div style="font-size: 2em; font-weight: bold; color: #28a745;">
-                         {scores.get('structure', 0):.1f}/10
-                     </div>
-                 </div>
-                 <div class="score-card">
-                     <h3>Answerability Score</h3>
-                     <div style="font-size: 2em; font-weight: bold; color: #28a745;">
-                         {scores.get('answerability', 0):.1f}/10
-                     </div>
-                 </div>
-                 <div class="score-card">
-                     <h3>Overall Average</h3>
-                     <div style="font-size: 2em; font-weight: bold; color: #28a745;">
-                         {scores.get('overall_average', 0):.1f}/10
-                     </div>
-                 </div>
-             </div>
- 
-             <div class="content-section">
-                 <h2>🔑 Identified Keywords</h2>
-                 <div class="keywords">
-                     {' '.join([f'<span class="keyword">{keyword}</span>' for keyword in optimization.get('keywords_identified', [])])}
-                 </div>
-             </div>
- 
-             {'<div class="content-section"><h2>✨ Optimized Content</h2><div class="optimized-content">' + optimization.get('optimized_content', '') + '</div></div>' if optimization.get('optimized_content') and not optimization.get('analyze_only') else ''}
- 
-             <div class="content-section">
-                 <h2>💡 Improvements Made</h2>
-                 <ul>
-                     {' '.join([f'<li>{improvement}</li>' for improvement in optimization.get('improvements_made', [])])}
-                 </ul>
-             </div>
- 
-             <div style="text-align: center; color: #666; margin-top: 40px; padding-top: 20px; border-top: 1px solid #ddd;">
-                 <p>Generated by GEO SEO AI Optimizer | {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
-             </div>
-         </body>
-         </html>
-             """
- 
-             return html_content
- 
-         except Exception as e:
-             return f"<html><body><h1>Enhancement HTML Export Error</h1><p>{str(e)}</p></body></html>"
- 
-     def _export_enhancement_csv(self, data: Dict[str, Any]) -> str:
-         """Export content enhancement results as CSV"""
-         try:
-             output = io.StringIO()
-             writer = csv.writer(output)
- 
-             # Header information
-             analysis = data.get('content_analysis', {})
-             scores = data.get('performance_scores', {})
-             optimization = data.get('optimization_results', {})
- 
-             writer.writerow(['Content Enhancement Analysis Report'])
-             writer.writerow(['Analysis Date:', analysis.get('analysis_date', 'Unknown')])
-             writer.writerow(['Original Content Length:', analysis.get('original_content_length', 0)])
-             writer.writerow(['Original Word Count:', analysis.get('original_word_count', 0)])
-             writer.writerow([])
- 
-             # Performance scores
-             writer.writerow(['Performance Scores'])
-             writer.writerow(['Metric', 'Score'])
-             for metric, score in scores.items():
-                 writer.writerow([metric.replace('_', ' ').title(), round(score, 2)])
- 
-             writer.writerow([])
-             writer.writerow(['Keywords Identified'])
-             for keyword in optimization.get('keywords_identified', []):
-                 writer.writerow([keyword])
- 
-             writer.writerow([])
-             writer.writerow(['Improvements Made'])
-             for improvement in optimization.get('improvements_made', []):
-                 writer.writerow([improvement])
- 
-             return output.getvalue()
- 
-         except Exception as e:
-             return f"Enhancement CSV export error: {str(e)}"
- 
-     def _export_qa_html(self, data: Dict[str, Any]) -> str:
-         """Export Q&A results as HTML"""
-         try:
-             session = data.get('qa_session', {})
-             summary = data.get('summary', {})
-             interactions = session.get('interactions', [])
- 
-             html_content = f"""
-         <!DOCTYPE html>
-         <html lang="en">
-         <head>
-             <meta charset="UTF-8">
-             <meta name="viewport" content="width=device-width, initial-scale=1.0">
-             <title>Q&A Session Report</title>
-             <style>
-                 body {{
-                     font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
-                     line-height: 1.6;
-                     color: #333;
-                     max-width: 1000px;
-                     margin: 0 auto;
-                     padding: 20px;
-                     background-color: #f8f9fa;
-                 }}
-                 .header {{
-                     background: linear-gradient(135deg, #6f42c1 0%, #e83e8c 100%);
-                     color: white;
-                     padding: 30px;
-                     border-radius: 10px;
-                     margin-bottom: 30px;
-                     text-align: center;
-                 }}
-                 .summary {{
-                     display: grid;
-                     grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
-                     gap: 20px;
-                     margin-bottom: 30px;
-                 }}
-                 .summary-card {{
-                     background: white;
-                     padding: 20px;
-                     border-radius: 10px;
-                     box-shadow: 0 4px 6px rgba(0,0,0,0.1);
-                     text-align: center;
-                 }}
-                 .qa-item {{
-                     background: white;
-                     padding: 20px;
-                     border-radius: 10px;
-                     box-shadow: 0 4px 6px rgba(0,0,0,0.1);
-                     margin-bottom: 20px;
-                 }}
-                 .question {{
-                     background: #e9ecef;
-                     padding: 15px;
-                     border-left: 4px solid #6f42c1;
-                     border-radius: 5px;
-                     margin-bottom: 15px;
-                 }}
-                 .answer {{
-                     padding: 15px;
-                     border-left: 4px solid #28a745;
-                     border-radius: 5px;
-                     background: #f8f9fa;
-                 }}
-                 .sources {{
-                     margin-top: 15px;
-                     padding: 10px;
-                     background: #fff3cd;
-                     border-radius: 5px;
-                     font-size: 0.9em;
-                 }}
-             </style>
-         </head>
-         <body>
-             <div class="header">
-                 <h1>💬 Q&A Session Report</h1>
-                 <p>Document Question & Answer Analysis</p>
-                 <p><strong>Session Date:</strong> {session.get('session_date', 'Unknown')}</p>
-             </div>
- 
-             <div class="summary">
-                 <div class="summary-card">
-                     <h3>Total Questions</h3>
-                     <div style="font-size: 2em; font-weight: bold; color: #6f42c1;">
-                         {session.get('total_questions', 0)}
-                     </div>
-                 </div>
-                 <div class="summary-card">
-                     <h3>Successful Answers</h3>
-                     <div style="font-size: 2em; font-weight: bold; color: #28a745;">
-                         {summary.get('successful_answers', 0)}
-                     </div>
-                 </div>
-                 <div class="summary-card">
-                     <h3>Avg Response Length</h3>
-                     <div style="font-size: 2em; font-weight: bold; color: #17a2b8;">
-                         {summary.get('average_response_length', 0):.0f}
-                     </div>
-                 </div>
-             </div>
- 
-             <h2>📝 Q&A Interactions</h2>
-             """
- 
-             # Add individual Q&A items
-             for i, interaction in enumerate(interactions, 1):
-                 question = interaction.get('query', 'No question')
-                 answer = interaction.get('result', interaction.get('answer', 'No answer'))
-                 sources = interaction.get('sources', [])
- 
-                 html_content += f"""
-             <div class="qa-item">
-                 <h3>Question {i}</h3>
-                 <div class="question">
-                     <strong>Q:</strong> {question}
-                 </div>
-                 <div class="answer">
-                     <strong>A:</strong> {answer}
-                 </div>
-                 """
- 
-                 if sources:
-                     html_content += '<div class="sources"><strong>Sources:</strong><ul>'
-                     for source in sources[:3]:  # Limit to first 3 sources
-                         content_preview = source.get('content', '')[:200] + '...' if len(source.get('content', '')) > 200 else source.get('content', '')
-                         html_content += f'<li>{content_preview}</li>'
-                     html_content += '</ul></div>'
- 
-                 html_content += '</div>'
- 
-             html_content += f"""
- 
-             <div style="text-align: center; color: #666; margin-top: 40px; padding-top: 20px; border-top: 1px solid #ddd;">
-                 <p>Generated by GEO SEO AI Optimizer | {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
-             </div>
-         </body>
-         </html>
-             """
- 
-             return html_content
- 
-         except Exception as e:
-             return f"<html><body><h1>Q&A HTML Export Error</h1><p>{str(e)}</p></body></html>"
- 
-     def _export_qa_csv(self, data: Dict[str, Any]) -> str:
-         """Export Q&A results as CSV"""
-         try:
-             output = io.StringIO()
-             writer = csv.writer(output)
- 
-             session = data.get('qa_session', {})
-             summary = data.get('summary', {})
-             interactions = session.get('interactions', [])
- 
-             # Header
-             writer.writerow(['Q&A Session Report'])
-             writer.writerow(['Session Date:', session.get('session_date', 'Unknown')])
-             writer.writerow(['Total Questions:', session.get('total_questions', 0)])
-             writer.writerow(['Successful Answers:', summary.get('successful_answers', 0)])
-             writer.writerow([])
- 
-             # Q&A data
-             writer.writerow(['Question Index', 'Question', 'Answer', 'Has Sources', 'Answer Length'])
- 
-             for i, interaction in enumerate(interactions, 1):
-                 question = interaction.get('query', 'No question')
-                 answer = interaction.get('result', interaction.get('answer', 'No answer'))
-                 has_sources = 'Yes' if interaction.get('sources') else 'No'
-                 answer_length = len(answer) if answer else 0
- 
-                 writer.writerow([i, question, answer, has_sources, answer_length])
- 
-             return output.getvalue()
- 
-         except Exception as e:
-             return f"Q&A CSV export error: {str(e)}"
- 
-     def _export_batch_excel(self, data: Dict[str, Any]) -> bytes:
-         """Export batch results as Excel file"""
-         try:
-             output = io.BytesIO()
- 
-             with pd.ExcelWriter(output, engine='openpyxl') as writer:
-                 # Batch metadata sheet
-                 metadata = data.get('batch_metadata', {})
-                 metadata_df = pd.DataFrame([
-                     {'Property': k, 'Value': v} for k, v in metadata.items()
-                 ])
-                 metadata_df.to_excel(writer, sheet_name='Batch Metadata', index=False)
- 
-                 # Batch summary sheet
-                 summary = data.get('batch_summary', {})
-                 summary_df = pd.DataFrame([
-                     {'Metric': k, 'Value': v} for k, v in summary.items()
-                 ])
-                 summary_df.to_excel(writer, sheet_name='Batch Summary', index=False)
- 
-                 # Individual results sheet
-                 results = data.get('batch_results', [])
-                 if results:
-                     # Flatten results for tabular format
-                     flattened_results = []
-                     for i, result in enumerate(results):
-                         flat_result = {'Batch_Index': i}
-                         self._flatten_dict(result, flat_result)
-                         flattened_results.append(flat_result)
- 
-                     results_df = pd.DataFrame(flattened_results)
-                     results_df.to_excel(writer, sheet_name='Batch Results', index=False)
- 
-             output.seek(0)
-             return output.getvalue()
- 
-         except Exception as e:
-             error_content = f"Batch Excel export failed: {str(e)}\n\nData:\n{json.dumps(data, indent=2)}"
-             return error_content.encode('utf-8')
- 
-     def _export_batch_csv(self, data: Dict[str, Any]) -> str:
-         """Export batch results as CSV"""
-         try:
-             output = io.StringIO()
-             writer = csv.writer(output)
- 
-             # Batch metadata
-             metadata = data.get('batch_metadata', {})
-             writer.writerow(['Batch Analysis Results'])
-             writer.writerow(['Export Timestamp:', data.get('export_timestamp', 'Unknown')])
-             writer.writerow([])
- 
-             writer.writerow(['Batch Metadata'])
-             for key, value in metadata.items():
-                 writer.writerow([key, value])
- 
-             writer.writerow([])
- 
-             # Batch summary
-             summary = data.get('batch_summary', {})
-             writer.writerow(['Batch Summary'])
-             for key, value in summary.items():
-                 writer.writerow([key, value])
- 
-             writer.writerow([])
- 
-             # Individual results (simplified)
-             results = data.get('batch_results', [])
-             if results:
-                 writer.writerow(['Individual Results'])
-                 writer.writerow(['Index', 'Status', 'Summary'])
- 
-                 for i, result in enumerate(results):
-                     status = 'Success' if not result.get('error') else 'Error'
-                     summary_text = str(result)[:100] + '...' if len(str(result)) > 100 else str(result)
-                     writer.writerow([i, status, summary_text])
- 
-             return output.getvalue()
- 
-         except Exception as e:
-             return f"Batch CSV export error: {str(e)}"
- 
-     def _export_geo_pdf(self, data: Dict[str, Any]) -> bytes:
-         """Export GEO data as PDF (placeholder - would need reportlab)"""
-         try:
-             # For now, return HTML content as bytes
-             # In a full implementation, you'd use reportlab or weasyprint
-             html_content = self._export_geo_html(data)
-             return html_content.encode('utf-8')
- 
-         except Exception as e:
-             error_content = f"PDF export not fully implemented. Error: {str(e)}"
-             return error_content.encode('utf-8')
- 
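Since `_export_geo_html` already builds a complete HTML report, WeasyPrint is the shorter of the two routes the placeholder above mentions. A hedged sketch of what a full implementation might look like (assuming `weasyprint` is installed, which this repo's requirements do not pin):

```python
from weasyprint import HTML  # assumed extra dependency, not in requirements.txt

def _export_geo_pdf(self, data):
    """Render the existing HTML report to real PDF bytes."""
    html_content = self._export_geo_html(data)
    # write_pdf() with no target argument returns the PDF document as bytes
    return HTML(string=html_content).write_pdf()
```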
-     def _create_executive_summary(self, analysis_data: Dict[str, Any]) -> Dict[str, Any]:
-         """Create executive summary report"""
-         try:
-             geo_results = analysis_data.get('geo_results', [])
-             enhancement_results = analysis_data.get('enhancement_results', {})
-             qa_results = analysis_data.get('qa_results', [])
- 
-             # Calculate key metrics
-             overall_performance = self._calculate_overall_performance(analysis_data)
- 
-             return {
-                 'executive_summary': {
-                     'overall_performance_score': overall_performance,
-                     'key_findings': self._extract_key_findings(analysis_data),
-                     'priority_recommendations': self._get_priority_recommendations(analysis_data),
-                     'roi_potential': self._estimate_roi_potential(overall_performance),
-                     'implementation_timeline': self._suggest_implementation_timeline(analysis_data),
-                     'resource_requirements': self._estimate_resource_requirements(analysis_data)
-                 }
-             }
- 
-         except Exception as e:
-             return {'error': f"Executive summary creation failed: {str(e)}"}
- 
-     def _create_summary_report(self, analysis_data: Dict[str, Any]) -> Dict[str, Any]:
-         """Create summary report"""
-         try:
-             return {
-                 'summary_report': {
-                     'analysis_overview': self._create_analysis_overview(analysis_data),
-                     'performance_metrics': self._summarize_performance_metrics(analysis_data),
-                     'improvement_opportunities': self._identify_improvement_opportunities(analysis_data),
-                     'competitive_position': self._assess_competitive_position(analysis_data),
-                     'next_steps': self._recommend_next_steps(analysis_data)
-                 }
-             }
- 
-         except Exception as e:
-             return {'error': f"Summary report creation failed: {str(e)}"}
- 
-     def _create_full_report(self, analysis_data: Dict[str, Any]) -> Dict[str, Any]:
-         """Create full detailed report"""
-         try:
-             return {
-                 'full_report': {
-                     'executive_summary': self._create_executive_summary(analysis_data).get('executive_summary', {}),
-                     'detailed_analysis': {
-                         'geo_analysis_details': analysis_data.get('geo_results', []),
-                         'content_optimization_details': analysis_data.get('enhancement_results', {}),
-                         'qa_performance_details': analysis_data.get('qa_results', [])
-                     },
-                     'methodology': self._document_methodology(),
-                     'data_sources': self._document_data_sources(analysis_data),
-                     'limitations': self._document_limitations(),
-                     'appendices': self._create_appendices(analysis_data)
-                 }
-             }
- 
-         except Exception as e:
-             return {'error': f"Full report creation failed: {str(e)}"}
- 
-     def _create_batch_summary(self, batch_results: List[Dict[str, Any]]) -> Dict[str, Any]:
-         """Create summary of batch processing results"""
-         try:
-             total_items = len(batch_results)
-             successful_items = len([r for r in batch_results if not r.get('error')])
-             failed_items = total_items - successful_items
- 
-             return {
-                 'total_items': total_items,
-                 'successful_items': successful_items,
-                 'failed_items': failed_items,
-                 'success_rate': (successful_items / total_items * 100) if total_items > 0 else 0,
-                 'processing_status': 'Completed',
-                 'average_processing_time': self._calculate_avg_processing_time(batch_results),
-                 'common_errors': self._identify_common_errors(batch_results)
-             }
- 
-         except Exception as e:
-             return {'error': f"Batch summary creation failed: {str(e)}"}
- 
-     def _generate_performance_insights(self, scores: Dict[str, float], overall_avg: float) -> List[str]:
-         """Generate performance insights from scores"""
-         insights = []
- 
-         try:
-             # Overall performance insight
-             if overall_avg >= 8.0:
-                 insights.append("Excellent overall GEO performance - content is well-optimized for AI search engines")
-             elif overall_avg >= 6.0:
-                 insights.append("Good GEO performance with room for improvement in specific areas")
-             elif overall_avg >= 4.0:
-                 insights.append("Moderate GEO performance - significant optimization opportunities exist")
-             else:
-                 insights.append("Low GEO performance - comprehensive optimization needed")
- 
-             # Specific metric insights
-             for metric, score in scores.items():
-                 if score < 5.0:
-                     metric_name = metric.replace('_', ' ').title()
-                     insights.append(f"Low {metric_name} score ({score:.1f}) needs immediate attention")
-                 elif score >= 8.5:
-                     metric_name = metric.replace('_', ' ').title()
-                     insights.append(f"Excellent {metric_name} score ({score:.1f}) - maintain current approach")
- 
-             return insights[:5]  # Return top 5 insights
- 
-         except Exception:
-             return ["Unable to generate performance insights"]
- 
-     def _generate_package_readme(self, analysis_data: Dict[str, Any]) -> str:
-         """Generate README file for export package"""
-         try:
-             readme_content = f"""
- GEO SEO AI Optimizer - Analysis Package
- =======================================
- 
- Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
- 
- This package contains the complete analysis results from the GEO SEO AI Optimizer tool.
- 
- Files Included:
- - JSON file: Complete raw data in JSON format
- - HTML file: Visual report for web viewing
- - CSV file: Tabular data for spreadsheet analysis
- - README.txt: This file
- 
- About GEO (Generative Engine Optimization):
- GEO is the practice of optimizing content for AI-powered search engines and
- language models. Unlike traditional SEO, GEO focuses on:
- 
- - AI search visibility
- - Query intent matching
- - Conversational readiness
- - Citation worthiness
- - Semantic richness
- - Context completeness
- 
- How to Use These Files:
- 1. Open the HTML file in a web browser for a visual report
- 2. Import the CSV file into Excel or Google Sheets for analysis
- 3. Use the JSON file for programmatic processing or integration
- 
- For more information about GEO optimization, visit the tool documentation.
- 
- Generated by: GEO SEO AI Optimizer v1.0
- """
-             return readme_content
- 
-         except Exception as e:
-             return f"README generation failed: {str(e)}"
- 
-     # Helper methods for data processing and analysis
- 
-     def _get_performance_level(self, score: float) -> str:
-         """Get performance level description for a score"""
-         if score >= 8.0:
-             return "Excellent"
-         elif score >= 6.0:
-             return "Good"
-         elif score >= 4.0:
-             return "Fair"
-         else:
-             return "Needs Improvement"
- 
-     def _categorize_recommendation(self, recommendation: str) -> str:
-         """Categorize a recommendation based on content"""
-         rec_lower = recommendation.lower()
- 
-         if any(word in rec_lower for word in ['structure', 'heading', 'format']):
-             return "Content Structure"
-         elif any(word in rec_lower for word in ['keyword', 'semantic', 'topic']):
-             return "SEO & Keywords"
-         elif any(word in rec_lower for word in ['clarity', 'readability', 'language']):
-             return "Content Quality"
-         elif any(word in rec_lower for word in ['technical', 'schema', 'markup']):
-             return "Technical SEO"
-         else:
-             return "General"
- 
-     def _calculate_avg_response_length(self, qa_results: List[Dict[str, Any]]) -> float:
-         """Calculate average response length for Q&A results"""
-         try:
-             response_lengths = []
-             for result in qa_results:
-                 answer = result.get('result', result.get('answer', ''))
-                 if answer and not result.get('error'):
-                     response_lengths.append(len(answer))
- 
-             return sum(response_lengths) / len(response_lengths) if response_lengths else 0
- 
-         except Exception:
-             return 0
- 
-     def _extract_common_topics(self, qa_results: List[Dict[str, Any]]) -> List[str]:
-         """Extract common topics from Q&A results"""
-         try:
-             # Simple topic extraction based on question keywords
-             topics = {}
- 
-             for result in qa_results:
-                 question = result.get('query', result.get('question', ''))
-                 if question:
-                     words = question.lower().split()
-                     for word in words:
-                         if len(word) > 4:  # Focus on longer words
-                             topics[word] = topics.get(word, 0) + 1
- 
-             # Return top 5 most common topics
-             sorted_topics = sorted(topics.items(), key=lambda x: x[1], reverse=True)
-             return [topic for topic, count in sorted_topics[:5]]
- 
-         except Exception:
-             return []
- 
-     def _flatten_dict(self, d: Dict[str, Any], parent_dict: Dict[str, Any], parent_key: str = '') -> None:
-         """Flatten nested dictionary for tabular export"""
-         try:
-             for key, value in d.items():
-                 new_key = f"{parent_key}_{key}" if parent_key else key
- 
-                 if isinstance(value, dict):
-                     self._flatten_dict(value, parent_dict, new_key)
-                 elif isinstance(value, list):
-                     parent_dict[new_key] = json.dumps(value)  # Convert lists to JSON strings
-                 else:
-                     parent_dict[new_key] = value
- 
-         except Exception:
-             pass  # Skip problematic keys
- 
-     def _calculate_overall_performance(self, analysis_data: Dict[str, Any]) -> float:
-         """Calculate overall performance score across all analyses"""
-         try:
-             scores = []
- 
-             # GEO scores
-             geo_results = analysis_data.get('geo_results', [])
-             for result in geo_results:
-                 if 'geo_scores' in result:
-                     geo_score_values = list(result['geo_scores'].values())
-                     if geo_score_values:
-                         scores.append(sum(geo_score_values) / len(geo_score_values))
- 
-             # Enhancement scores
-             enhancement = analysis_data.get('enhancement_results', {})
-             if 'scores' in enhancement:
-                 enh_scores = list(enhancement['scores'].values())
-                 if enh_scores:
-                     scores.append(sum(enh_scores) / len(enh_scores))
- 
-             return sum(scores) / len(scores) if scores else 0
- 
-         except Exception:
-             return 0
- 
-     def _extract_key_findings(self, analysis_data: Dict[str, Any]) -> List[str]:
-         """Extract key findings from analysis data"""
-         findings = []
- 
-         try:
-             # Add findings based on performance scores
-             overall_perf = self._calculate_overall_performance(analysis_data)
- 
-             if overall_perf >= 8.0:
-                 findings.append("Content demonstrates excellent AI search optimization")
-             elif overall_perf <= 4.0:
-                 findings.append("Significant optimization opportunities identified")
- 
-             # Add more specific findings based on data
-             geo_results = analysis_data.get('geo_results', [])
-             if geo_results:
-                 findings.append(f"Analyzed {len(geo_results)} pages for GEO performance")
- 
-             enhancement = analysis_data.get('enhancement_results', {})
-             if enhancement and 'keywords' in enhancement:
-                 findings.append(f"Identified {len(enhancement['keywords'])} key optimization terms")
- 
-             return findings[:5]  # Return top 5 findings
- 
-         except Exception:
-             return ["Unable to extract key findings"]
- 
-     def _get_priority_recommendations(self, analysis_data: Dict[str, Any]) -> List[str]:
-         """Get priority recommendations from analysis"""
-         try:
-             recommendations = []
- 
-             # Collect all recommendations from different analyses
-             geo_results = analysis_data.get('geo_results', [])
-             for result in geo_results:
-                 recommendations.extend(result.get('recommendations', []))
- 
-             # Remove duplicates and return top priorities
-             unique_recs = list(set(recommendations))
-             return unique_recs[:3]  # Top 3 priority recommendations
- 
-         except Exception:
-             return ["Review and implement GEO best practices"]
- 
-     def _estimate_roi_potential(self, performance_score: float) -> str:
-         """Estimate ROI potential based on performance score"""
-         if performance_score <= 4.0:
-             return "High - Significant improvement potential"
-         elif performance_score <= 6.0:
-             return "Medium - Moderate improvement opportunities"
-         else:
-             return "Low - Already well-optimized"
- 
-     def _suggest_implementation_timeline(self, analysis_data: Dict[str, Any]) -> str:
-         """Suggest implementation timeline"""
-         try:
-             overall_perf = self._calculate_overall_performance(analysis_data)
- 
-             if overall_perf <= 4.0:
-                 return "3-6 months for comprehensive optimization"
-             elif overall_perf <= 6.0:
-                 return "1-3 months for targeted improvements"
-             else:
-                 return "Ongoing maintenance and monitoring"
- 
-         except Exception:
-             return "Timeline assessment unavailable"
- 
-     def _estimate_resource_requirements(self, analysis_data: Dict[str, Any]) -> Dict[str, str]:
-         """Estimate resource requirements"""
-         return {
-             'content_team': 'Required for content optimization',
-             'technical_team': 'Required for technical implementations',
-             'timeline': self._suggest_implementation_timeline(analysis_data),
-             'budget': 'Varies based on scope of optimizations'
-         }
- 
-     def _create_analysis_overview(self, analysis_data: Dict[str, Any]) -> Dict[str, Any]:
-         """Create analysis overview"""
-         try:
-             return {
-                 'analyses_performed': list(analysis_data.keys()),
-                 'total_items_analyzed': sum(len(v) if isinstance(v, list) else 1 for v in analysis_data.values()),
-                 'analysis_scope': 'Comprehensive GEO and content optimization analysis',
-                 'key_focus_areas': ['AI Search Optimization', 'Content Enhancement', 'Performance Analysis']
-             }
- 
-         except Exception:
-             return {'error': 'Overview creation failed'}
- 
-     def _summarize_performance_metrics(self, analysis_data: Dict[str, Any]) -> Dict[str, float]:
-         """Summarize performance metrics"""
-         try:
-             return {
-                 'overall_performance': self._calculate_overall_performance(analysis_data),
-                 'optimization_potential': 10 - self._calculate_overall_performance(analysis_data),
-                 'completion_rate': 100.0  # Assuming analysis completed successfully
-             }
- 
-         except Exception:
-             return {}
- 
-     def _identify_improvement_opportunities(self, analysis_data: Dict[str, Any]) -> List[str]:
-         """Identify improvement opportunities"""
-         return self._get_priority_recommendations(analysis_data)
- 
-     def _assess_competitive_position(self, analysis_data: Dict[str, Any]) -> str:
-         """Assess competitive position"""
-         try:
-             overall_perf = self._calculate_overall_performance(analysis_data)
- 
-             if overall_perf >= 8.0:
-                 return "Strong - Above average GEO performance"
-             elif overall_perf >= 6.0:
-                 return "Competitive - Meeting industry standards"
-             elif overall_perf >= 4.0:
-                 return "Below Average - Improvement needed"
-             else:
-                 return "Weak - Significant optimization required"
- 
-         except Exception:
-             return "Assessment unavailable"
- 
-     def _recommend_next_steps(self, analysis_data: Dict[str, Any]) -> List[str]:
-         """Recommend next steps"""
-         steps = [
-             "Review detailed analysis results",
-             "Prioritize recommendations by impact",
-             "Develop implementation plan",
-             "Monitor performance improvements"
-         ]
- 
-         # Add specific steps based on performance
-         overall_perf = self._calculate_overall_performance(analysis_data)
-         if overall_perf <= 4.0:
-             steps.insert(1, "Focus on fundamental GEO optimization")
- 
-         return steps
- 
-     def _document_methodology(self) -> Dict[str, str]:
-         """Document analysis methodology"""
-         return {
-             'geo_analysis': 'AI-powered content analysis using specialized GEO metrics',
-             'content_optimization': 'LLM-based content enhancement and scoring',
-             'performance_scoring': 'Multi-dimensional scoring system for AI search optimization',
-             'data_collection': 'Automated content parsing and analysis',
-             'validation': 'Cross-referenced metrics and quality assurance checks'
-         }
- 
-     def _document_data_sources(self, analysis_data: Dict[str, Any]) -> List[str]:
-         """Document data sources used in analysis"""
-         sources = []
- 
-         if 'geo_results' in analysis_data:
-             sources.append("Website content analysis")
-         if 'enhancement_results' in analysis_data:
-             sources.append("Content optimization analysis")
-         if 'qa_results' in analysis_data:
-             sources.append("Document Q&A interactions")
- 
-         sources.extend([
-             "AI-powered content scoring",
-             "GEO performance metrics",
-             "Industry best practices database"
-         ])
- 
-         return sources
- 
-     def _document_limitations(self) -> List[str]:
-         """Document analysis limitations"""
-         return [
-             "Analysis based on current content snapshot",
-             "Performance may vary with search engine algorithm updates",
-             "Recommendations require human review for implementation",
-             "Results depend on quality of input content",
-             "AI model performance may vary across different content types"
-         ]
- 
-     def _create_appendices(self, analysis_data: Dict[str, Any]) -> Dict[str, Any]:
-         """Create report appendices"""
-         try:
-             return {
-                 'technical_details': {
-                     'models_used': ['GPT-based content analysis', 'Semantic similarity scoring'],
-                     'processing_time': 'Variable based on content volume',
-                     'confidence_intervals': 'Scores provided with ±0.5 accuracy'
-                 },
-                 'glossary': {
-                     'GEO': 'Generative Engine Optimization - optimization for AI search engines',
-                     'AI Search Visibility': 'Likelihood of content appearing in AI search results',
-                     'Citation Worthiness': 'Probability of content being cited by AI systems',
-                     'Conversational Readiness': 'Suitability for AI chat responses'
-                 },
-                 'references': [
-                     'GEO Best Practices Guide',
-                     'AI Search Engine Optimization Standards',
-                     'Content Performance Benchmarks'
-                 ]
-             }
- 
-         except Exception:
-             return {}
- 
-     def _calculate_avg_processing_time(self, batch_results: List[Dict[str, Any]]) -> float:
-         """Calculate average processing time for batch results"""
-         try:
-             processing_times = []
- 
-             for result in batch_results:
-                 if 'processing_time' in result:
-                     processing_times.append(result['processing_time'])
- 
-             return sum(processing_times) / len(processing_times) if processing_times else 0
- 
-         except Exception:
-             return 0
- 
-     def _identify_common_errors(self, batch_results: List[Dict[str, Any]]) -> List[str]:
-         """Identify common errors in batch processing"""
-         try:
-             error_counts = {}
- 
-             for result in batch_results:
-                 if result.get('error'):
-                     error_msg = str(result['error'])[:50]  # First 50 chars
-                     error_counts[error_msg] = error_counts.get(error_msg, 0) + 1
- 
-             # Return top 3 most common errors
-             sorted_errors = sorted(error_counts.items(), key=lambda x: x[1], reverse=True)
-             return [error for error, count in sorted_errors[:3]]
- 
-         except Exception:
-             return []
- 
- 
- class DataValidator:
-     """Helper class for validating export data"""
- 
-     @staticmethod
-     def validate_geo_data(geo_results: List[Dict[str, Any]]) -> Dict[str, Any]:
-         """Validate GEO analysis data structure"""
-         validation_result = {
-             'valid': True,
-             'errors': [],
-             'warnings': []
-         }
- 
-         try:
-             if not geo_results:
-                 validation_result['errors'].append("No GEO results provided")
-                 validation_result['valid'] = False
-                 return validation_result
- 
-             for i, result in enumerate(geo_results):
-                 # Check required fields
-                 if 'geo_scores' not in result:
-                     validation_result['warnings'].append(f"Result {i} missing geo_scores")
- 
-                 if 'page_data' not in result:
-                     validation_result['warnings'].append(f"Result {i} missing page_data")
- 
-                 # Validate score ranges
-                 if 'geo_scores' in result:
-                     for metric, score in result['geo_scores'].items():
-                         if not isinstance(score, (int, float)) or score < 0 or score > 10:
-                             validation_result['errors'].append(f"Invalid score for {metric} in result {i}")
-                             validation_result['valid'] = False
- 
-             return validation_result
- 
-         except Exception as e:
-             validation_result['errors'].append(f"Validation failed: {str(e)}")
-             validation_result['valid'] = False
-             return validation_result
- 
-     @staticmethod
-     def validate_enhancement_data(enhancement_result: Dict[str, Any]) -> Dict[str, Any]:
-         """Validate content enhancement data structure"""
-         validation_result = {
-             'valid': True,
-             'errors': [],
-             'warnings': []
-         }
- 
-         try:
-             # Check for required fields
-             if 'scores' not in enhancement_result:
-                 validation_result['warnings'].append("Enhancement result missing scores")
- 
-             # Validate score structure
-             if 'scores' in enhancement_result:
-                 scores = enhancement_result['scores']
-                 required_scores = ['clarity', 'structuredness', 'answerability']
- 
-                 for req_score in required_scores:
-                     if req_score not in scores:
-                         validation_result['warnings'].append(f"Missing {req_score} score")
-                     elif not isinstance(scores[req_score], (int, float)):
-                         validation_result['errors'].append(f"Invalid {req_score} score type")
-                         validation_result['valid'] = False
- 
-             return validation_result
- 
-         except Exception as e:
-             validation_result['errors'].append(f"Enhancement validation failed: {str(e)}")
-             validation_result['valid'] = False
-             return validation_result
- 
- 
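The validator's contract is a plain dict with `valid`, `errors`, and `warnings` keys, so a caller gates an export on it roughly like this; a minimal sketch with illustrative variable names:

```python
# geo_results: the list of per-page result dicts described above (hypothetical variable)
validation = DataValidator.validate_geo_data(geo_results)
if not validation['valid']:
    raise ValueError(f"GEO data rejected: {validation['errors']}")
for warning in validation['warnings']:
    print(f"warning: {warning}")  # non-fatal issues, e.g. a result missing page_data
```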
- class ExportManager:
-     """High-level export management class"""
- 
-     def __init__(self):
-         self.exporter = ResultExporter()
-         self.validator = DataValidator()
-         self.export_history = []
- 
-     def export_with_validation(self, data: Dict[str, Any], data_type: str,
-                                format_type: str = 'json') -> Dict[str, Any]:
-         """Export data with validation"""
-         try:
-             # Validate data first
-             if data_type == 'geo_analysis':
-                 validation = self.validator.validate_geo_data(data.get('geo_results', []))
-             elif data_type == 'content_optimization':
-                 validation = self.validator.validate_enhancement_data(data)
-             else:
-                 validation = {'valid': True, 'errors': [], 'warnings': []}
- 
-             # Proceed with export if validation passes
-             if validation['valid']:
-                 if data_type == 'geo_analysis':
-                     result = self.exporter.export_geo_results(
-                         data.get('geo_results', []),
-                         data.get('website_url', 'unknown'),
-                         format_type
-                     )
-                 elif data_type == 'content_optimization':
-                     result = self.exporter.export_enhancement_results(data, format_type)
-                 else:
-                     result = json.dumps(data, indent=2, ensure_ascii=False)
- 
-                 # Log export
-                 self.export_history.append({
-                     'timestamp': datetime.now().isoformat(),
-                     'data_type': data_type,
-                     'format_type': format_type,
-                     'validation_warnings': validation.get('warnings', []),
-                     'success': True
-                 })
- 
-                 return {
-                     'success': True,
-                     'data': result,
-                     'validation': validation
-                 }
-             else:
-                 return {
-                     'success': False,
-                     'error': 'Data validation failed',
-                     'validation': validation
-                 }
- 
-         except Exception as e:
-             self.export_history.append({
-                 'timestamp': datetime.now().isoformat(),
-                 'data_type': data_type,
-                 'format_type': format_type,
-                 'success': False,
-                 'error': str(e)
-             })
- 
-             return {
-                 'success': False,
-                 'error': f"Export failed: {str(e)}"
-             }
- 
-     def get_export_history(self) -> List[Dict[str, Any]]:
-         """Get export history"""
-         return self.export_history
- 
-     def clear_export_history(self) -> None:
-         """Clear export history"""
-         self.export_history.clear()
- 
-     def get_supported_formats(self) -> Dict[str, List[str]]:
-         """Get supported export formats by data type"""
-         return {
-             'geo_analysis': ['json', 'csv', 'html', 'xlsx', 'pdf'],
-             'content_optimization': ['json', 'html', 'csv'],
-             'qa_results': ['json', 'html', 'csv'],
-             'batch_analysis': ['json', 'xlsx', 'csv']
-         }
- 
-     def create_multi_format_export(self, data: Dict[str, Any], data_type: str,
-                                    formats: List[str] = None) -> Dict[str, Any]:
-         """Create export in multiple formats"""
-         if formats is None:
-             formats = ['json', 'html', 'csv']
- 
-         results = {}
- 
-         for format_type in formats:
-             try:
-                 export_result = self.export_with_validation(data, data_type, format_type)
-                 if export_result['success']:
-                     results[format_type] = export_result['data']
-                 else:
-                     results[format_type] = {'error': export_result['error']}
- 
-             except Exception as e:
-                 results[format_type] = {'error': str(e)}
- 
-         return {
-             'multi_format_export': results,
-             'formats_generated': list(results.keys()),
-             'successful_formats': [fmt for fmt, data in results.items() if 'error' not in data]
-         }
- 
- 
- # Utility functions for the export module
- 
- def create_export_template(data_type: str) -> Dict[str, Any]:
-     """Create export template for different data types"""
-     templates = {
-         'geo_analysis': {
-             'website_url': 'https://example.com',
-             'geo_results': [
-                 {
-                     'page_data': {
-                         'url': 'https://example.com/page1',
-                         'title': 'Example Page',
-                         'word_count': 500
-                     },
-                     'geo_scores': {
-                         'ai_search_visibility': 7.5,
-                         'query_intent_matching': 6.8,
-                         'conversational_readiness': 8.2,
-                         'citation_worthiness': 7.1
-                     },
-                     'recommendations': [
-                         'Improve content structure',
-                         'Add more specific examples'
-                     ]
-                 }
-             ]
-         },
-         'content_optimization': {
-             'scores': {
-                 'clarity': 7.5,
-                 'structuredness': 6.8,
-                 'answerability': 8.2
-             },
-             'keywords': ['example', 'optimization', 'content'],
-             'optimized_text': 'This is the optimized version of the content...',
-             'optimization_suggestions': [
-                 'Improve sentence structure',
-                 'Add more specific keywords'
-             ]
-         },
-         'qa_results': [
-             {
-                 'query': 'What is the main topic?',
-                 'result': 'The main topic is content optimization for AI systems.',
-                 'sources': [
-                     {
-                         'content': 'Source document content...',
-                         'metadata': {'source': 'document1.pdf'}
-                     }
-                 ]
-             }
-         ]
-     }
- 
-     return templates.get(data_type, {})
- 
- 
- def export_demo_data() -> Dict[str, Any]:
-     """Export demonstration data for testing"""
-     demo_data = {
-         'geo_analysis_demo': create_export_template('geo_analysis'),
-         'content_optimization_demo': create_export_template('content_optimization'),
-         'qa_results_demo': create_export_template('qa_results')
-     }
- 
-     return demo_data
- 
- 
- # Export the main classes and functions
- __all__ = [
-     'ResultExporter',
-     'GEOReport',
-     'ContentAnalysis',
-     'DataValidator',
-     'ExportManager',
-     'create_export_template',
-     'export_demo_data'
- ]
- 
- 
- # Example usage for testing
- if __name__ == "__main__":
-     # Create exporter instance
-     exporter = ResultExporter()
- 
-     # Test with demo data
-     demo_geo_data = create_export_template('geo_analysis')
- 
-     # Export in different formats
-     json_export = exporter.export_geo_results(
-         demo_geo_data['geo_results'],
-         demo_geo_data['website_url'],
-         'json'
-     )
- 
-     html_export = exporter.export_geo_results(
-         demo_geo_data['geo_results'],
-         demo_geo_data['website_url'],
-         'html'
-     )
- 
-     print("JSON Export:", json_export[:200] + "..." if len(str(json_export)) > 200 else json_export)
-     print("\nHTML Export:", html_export[:200] + "..." if len(str(html_export)) > 200 else html_export)
- 
-     # Test enhancement export
-     demo_enhancement = create_export_template('content_optimization')
-     enhancement_export = exporter.export_enhancement_results(demo_enhancement, 'json')
- 
-     print("\nEnhancement Export:", enhancement_export[:200] + "..." if len(str(enhancement_export)) > 200 else enhancement_export)
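The `__main__` block exercises `ResultExporter` directly; for completeness, a sketch of driving the higher-level `ExportManager` with the same demo template (the output file name is illustrative):

```python
manager = ExportManager()
demo = create_export_template('geo_analysis')

# Single validated export
result = manager.export_with_validation(demo, 'geo_analysis', format_type='html')
if result['success']:
    with open('geo_report.html', 'w', encoding='utf-8') as f:
        f.write(result['data'])

# Fan out to several formats at once; failures land per-format as {'error': ...}
bundle = manager.create_multi_format_export(demo, 'geo_analysis', formats=['json', 'csv'])
print(bundle['successful_formats'])
```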
utils/optimizer.py DELETED
@@ -1,354 +0,0 @@
- # Enhanced Content Optimization Module with RAG for GEO
- # Integrates RAG functionality for better Generative Engine Optimization
- 
- import json
- import re
- from typing import Dict, Any, List, Optional
- from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
- from langchain.schema import Document
- 
- 
- class ContentOptimizer:
-     """Enhanced Content Optimizer with RAG capabilities for GEO"""
- 
-     def __init__(self, llm, vector_chunker=None):
-         self.llm = llm
-         self.vector_chunker = vector_chunker
-         self.setup_prompts()
-         self.setup_geo_knowledge_base()
- 
-     def setup_geo_knowledge_base(self):
-         """Initialize GEO best practices knowledge base"""
-         self.geo_knowledge = [
-             """
-             Generative Engine Optimization (GEO) Best Practices:
- 
-             1. Structure for AI Consumption:
-                - Use clear headings and subheadings
-                - Include bullet points and numbered lists
-                - Provide direct, concise answers to common questions
-                - Use schema markup when possible
- 
-             2. Content Format for LLMs:
-                - Answer questions directly in the first sentence
-                - Use "what, why, how" question patterns
-                - Include relevant entities and proper nouns
-                - Maintain factual accuracy with citations
- 
-             3. Semantic Optimization:
-                - Include related terms and synonyms
-                - Use entity-rich content (people, places, organizations)
-                - Connect concepts with clear relationships
-                - Optimize for topic clusters, not just keywords
-             """,
- 
-             """
-             AI Search Visibility Optimization:
- 
-             1. Query Intent Matching:
-                - Address user intent explicitly
-                - Use natural language patterns
-                - Include question-answer pairs
-                - Optimize for conversational queries
- 
-             2. Citation Worthiness:
-                - Include authoritative sources and data
-                - Use specific facts and statistics
-                - Provide expert opinions and insights
-                - Maintain consistent tone and expertise
- 
-             3. Multi-Query Coverage:
-                - Address related questions in the same content
-                - Use comprehensive topic coverage
-                - Include long-tail and specific queries
-                - Provide context for complex topics
-             """,
- 
-             """
-             Content Structure for AI Systems:
- 
-             1. Information Architecture:
-                - Lead with key information
-                - Use inverted pyramid structure
-                - Include table of contents for long content
-                - Break complex topics into digestible sections
- 
-             2. Conversational Readiness:
-                - Write in active voice
-                - Use clear, direct language
-                - Include transitional phrases
-                - Optimize sentence length (12-20 words)
- 
-             3. Context Completeness:
-                - Define technical terms
-                - Provide background information
-                - Include relevant examples
-                - Connect to broader topic context
-             """
-         ]
- 
-     def setup_prompts(self):
-         """Initialize optimization prompts with RAG integration"""
-         self.rag_enhancement_prompt = """
-         You are a Generative Engine Optimization (GEO) specialist with access to best practices knowledge.
- 
-         Based on the provided GEO knowledge and the user's content, optimize the content for:
-         1. AI search engines (ChatGPT, Claude, Gemini)
-         2. LLM-based question answering systems
-         3. Conversational AI interfaces
-         4. Citation and reference systems
- 
-         Use the knowledge base to inform your optimization decisions.
- 
-         Knowledge Base Context:
-         {context}
- 
-         Original Content:
-         {content}
- 
-         Provide comprehensive GEO optimization in JSON format:
-         ```json
-         {{
-             "geo_analysis": {{
-                 "current_geo_score": 7.5,
-                 "ai_search_visibility": 8.0,
-                 "query_intent_matching": 7.0,
-                 "conversational_readiness": 8.5,
-                 "citation_worthiness": 6.5,
-                 "context_completeness": 7.5
-             }},
-             "optimization_opportunities": [
-                 {{
-                     "type": "Structure Enhancement",
-                     "description": "Add clear headings and Q&A format",
-                     "priority": "high",
-                     "expected_impact": "Improve AI parsing by 25%"
-                 }}
-             ],
-             "optimized_content": {{
-                 "enhanced_text": "Your optimized content here...",
-                 "structural_improvements": ["Added FAQ section", "Improved headings"],
-                 "semantic_enhancements": ["Added related terms", "Improved entity density"]
-             }},
-             "geo_keywords": {{
-                 "primary_entities": ["entity1", "entity2"],
-                 "semantic_terms": ["term1", "term2"],
-                 "question_patterns": ["What is...", "How does..."],
-                 "related_concepts": ["concept1", "concept2"]
-             }},
-             "recommendations": [
-                 "Add more specific examples",
-                 "Include authoritative citations",
-                 "Improve conversational flow"
-             ]
-         }}
-         ```
-         """.strip()
- 
-         self.competitive_geo_prompt = """
-         Analyze the content against GEO best practices and identify competitive optimization opportunities.
- 
-         GEO Knowledge Base:
-         {context}
- 
-         Content to Analyze:
-         {content}
- 
-         Provide competitive GEO analysis:
-         ```json
-         {{
-             "competitive_gaps": {{
-                 "missing_question_patterns": ["What questions aren't covered"],
-                 "entity_gaps": ["Important entities not mentioned"],
-                 "semantic_opportunities": ["Related terms to include"],
-                 "structural_weaknesses": ["Formatting issues for AI"]
-             }},
-             "benchmark_comparison": {{
-                 "current_performance": {{
-                     "ai_answerability": 6.5,
-                     "semantic_richness": 7.0,
-                     "structural_clarity": 8.0
-                 }},
-                 "optimization_potential": {{
-                     "ai_answerability": 9.0,
-                     "semantic_richness": 8.5,
-                     "structural_clarity": 9.5
-                 }}
-             }},
-             "action_plan": [
-                 {{
-                     "priority": "high",
-                     "action": "Add FAQ section",
-                     "rationale": "Improves direct question answering"
-                 }}
-             ]
-         }}
-         ```
-         """.strip()
- 
- 
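Both prompts above force the reply into a ```json fence, so `_parse_optimization_result` (defined further down the file, past this excerpt) has to dig that fence out of the raw completion. A hedged sketch of one common way to do it, not necessarily what this module actually implemented:

```python
import json
import re

def parse_fenced_json(raw: str) -> dict:
    """Pull the first ```json ... ``` block out of an LLM reply,
    falling back to treating the whole reply as JSON."""
    match = re.search(r"```(?:json)?\s*(\{.*\})\s*```", raw, re.DOTALL)
    candidate = match.group(1) if match else raw
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        return {"error": "Could not parse optimization result", "raw_response": raw}
```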
-     def optimize_content_with_rag(self, content: str, optimization_type: str = "geo_standard", analyze_only: bool = False) -> Dict[str, Any]:
-         try:
-             knowledge_docs = [Document(page_content=k, metadata={"source": "geo_best_practices"}) for k in self.geo_knowledge]
-             context = "\n\n".join(self.geo_knowledge)
- 
-             if self.vector_chunker:
-                 qa_chain = self.vector_chunker.create_qa_chain(knowledge_docs, self.llm)
-                 geo_query = f"How to optimize this type of content for AI search engines: {content[:500]}"
-                 context_result = qa_chain({"query": geo_query})
-                 context = context_result.get("result", context)
- 
-             return self._competitive_geo_optimization(content, context) if optimization_type == "competitive_geo" else self._standard_geo_optimization(content, context, analyze_only)
- 
-         except Exception as e:
-             return {"error": f"RAG-enhanced optimization failed: {str(e)}"}
- 
-     def _standard_geo_optimization(self, content: str, context: str, analyze_only: bool) -> Dict[str, Any]:
-         try:
-             prompt = ChatPromptTemplate.from_messages([
-                 SystemMessagePromptTemplate.from_template(self.rag_enhancement_prompt),
-                 HumanMessagePromptTemplate.from_template("Optimize this content using GEO best practices.")
-             ])
-             result = (prompt | self.llm).invoke({"context": context, "content": content[:5000]})
-             parsed = self._parse_optimization_result(getattr(result, 'content', str(result)))
-             parsed.update({
-                 'optimization_type': 'geo_standard',
-                 'rag_enhanced': True,
-                 'analyze_only': analyze_only,
-                 'original_length': len(content),
-                 'knowledge_sources': len(self.geo_knowledge)
-             })
-             return parsed
-         except Exception as e:
-             return {"error": f"Standard GEO optimization failed: {str(e)}"}
- 
-     def _competitive_geo_optimization(self, content: str, context: str) -> Dict[str, Any]:
-         try:
-             prompt = ChatPromptTemplate.from_messages([
-                 SystemMessagePromptTemplate.from_template(self.competitive_geo_prompt),
-                 HumanMessagePromptTemplate.from_template("Perform competitive GEO analysis.")
-             ])
-             result = (prompt | self.llm).invoke({"context": context, "content": content[:5000]})
-             parsed = self._parse_optimization_result(getattr(result, 'content', str(result)))
-             parsed.update({
-                 'optimization_type': 'competitive_geo',
-                 'rag_enhanced': True,
-                 'competitive_analysis': True
-             })
-             return parsed
-         except Exception as e:
-             return {"error": f"Competitive GEO optimization failed: {str(e)}"}
- 
-     def batch_optimize_with_rag(self, content_list: List[str], optimization_type: str = "geo_standard") -> List[Dict[str, Any]]:
-         results = []
-         for i, content in enumerate(content_list):
-             try:
-                 result = self.optimize_content_with_rag(content, optimization_type)
-                 result['batch_index'] = i
-                 results.append(result)
-             except Exception as e:
-                 results.append({
-                     'batch_index': i,
-                     'error': f"Batch GEO optimization failed: {str(e)}"
-                 })
-         return results
- 
- def analyze_geo_readability(self, content: str) -> Dict[str, Any]:
257
- try:
258
- words = content.split()
259
- sentences = [s.strip() for s in re.split(r'[.!?]+', content) if s.strip()]
260
- paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]
261
-
262
- metrics = {
263
- 'questions': len(re.findall(r'\?', content)),
264
- 'headings': len(re.findall(r'^#+\s', content, re.MULTILINE)),
265
- 'lists': len(re.findall(r'^\s*[-*+]\s', content, re.MULTILINE)),
266
- 'entities': len(re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', content)),
267
- 'numbers': len(re.findall(r'\b\d+\.?\d*\b', content)),
268
- 'sentence_count': len(sentences),
269
- 'word_count': len(words)
270
- }
271
-
272
- geo_score = self._calculate_geo_readability_score({
273
- 'avg_words_per_sentence': metrics['word_count'] / metrics['sentence_count'] if metrics['sentence_count'] else 0,
274
- 'questions_ratio': metrics['questions'] / metrics['sentence_count'] if metrics['sentence_count'] else 0,
275
- 'structure_elements': metrics['headings'] + metrics['lists'],
276
- 'entity_density': metrics['entities'] / metrics['word_count'] if metrics['word_count'] else 0,
277
- 'numeric_data': metrics['numbers'] / metrics['word_count'] if metrics['word_count'] else 0
278
- })
279
-
280
- return {
281
- 'geo_readability_metrics': metrics,
282
- 'geo_readability_score': geo_score,
283
- 'geo_recommendations': self._generate_geo_recommendations(metrics)
284
- }
285
- except Exception as e:
286
- return {'error': f"GEO readability analysis failed: {str(e)}"}
287
-
288
- def _calculate_geo_readability_score(self, m: Dict[str, float]) -> float:
289
- try:
290
- score = (
291
- max(0, 10 - abs(m['avg_words_per_sentence'] - 15) * 0.3) * 0.2 +
292
- min(10, m['questions_ratio'] * 50) * 0.25 +
293
- min(10, m['structure_elements'] * 1.5) * 0.25 +
294
- min(10, m['entity_density'] * 100) * 0.15 +
295
- min(10, m['numeric_data'] * 200) * 0.15
296
- )
297
- return round(score, 1)
298
- except Exception:
299
- return 5.0
300
-
301
- def _generate_geo_recommendations(self, m: Dict[str, int]) -> List[str]:
302
- r = []
303
- if m['questions'] == 0:
304
- r.append("Add FAQ section or question-based headings.")
305
- if m['headings'] < 2:
306
- r.append("Use more structured headings.")
307
- if m['lists'] == 0:
308
- r.append("Include bullet points or numbered lists.")
309
- if m['entities'] < 5:
310
- r.append("Add named or topical entities.")
311
- if m['questions'] / m['sentence_count'] < 0.1:
312
- r.append("Transform statements into Q&A pairs.")
313
- return r
314
-
315
- def _clean_json_string(self, json_str: str) -> str:
316
- json_str = json_str.replace("...", "")
317
- json_str = re.sub(r",\s*([}\]])", r"\\1", json_str)
318
- json_str = json_str.strip('`')
319
- return json_str
320
-
321
- def _parse_optimization_result(self, response_text: str) -> Dict[str, Any]:
322
- try:
323
- start = response_text.find('{')
324
- end = response_text.rfind('}') + 1
325
- if start != -1 and end != -1:
326
- json_str = self._clean_json_string(response_text[start:end])
327
- return json.loads(json_str)
328
- return {
329
- 'raw_response': response_text,
330
- 'parsing_error': 'No JSON structure found in response',
331
- 'geo_analysis': {},
332
- 'recommendations': []
333
- }
334
- except json.JSONDecodeError as e:
335
- return {
336
- 'raw_response': response_text,
337
- 'parsing_error': f'JSON decode error: {str(e)}',
338
- 'geo_analysis': {},
339
- 'recommendations': []
340
- }
341
- except Exception as e:
342
- return {
343
- 'raw_response': response_text,
344
- 'parsing_error': f'Unexpected error: {str(e)}',
345
- 'geo_analysis': {},
346
- 'recommendations': []
347
- }
348
-
349
- # Legacy support methods
350
- def optimize_content(self, content: str, analyze_only: bool = False, include_keywords: bool = True, optimization_type: str = "standard") -> Dict[str, Any]:
351
- return self.optimize_content_with_rag(content, optimization_type, analyze_only)
352
-
353
- def analyze_content_readability(self, content: str) -> Dict[str, Any]:
354
- return self.analyze_geo_readability(content)
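Before it was deleted, the optimizer's readability heuristic combined five signals into one weighted 0-10 score. A minimal, self-contained sketch of that same formula, useful for checking the deletion against git history; the function name `geo_readability_score` is illustrative, not part of the removed API:

```python
import re

def geo_readability_score(content: str) -> float:
    """Recompute the removed weighted GEO readability heuristic."""
    words = content.split()
    sentences = [s for s in re.split(r'[.!?]+', content) if s.strip()]
    headings = len(re.findall(r'^#+\s', content, re.MULTILINE))
    lists = len(re.findall(r'^\s*[-*+]\s', content, re.MULTILINE))
    entities = len(re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', content))
    numbers = len(re.findall(r'\b\d+\.?\d*\b', content))
    wc, sc = len(words), len(sentences)
    avg_wps = wc / sc if sc else 0          # average words per sentence
    q_ratio = content.count('?') / sc if sc else 0
    score = (
        max(0, 10 - abs(avg_wps - 15) * 0.3) * 0.20 +   # reward ~15-word sentences
        min(10, q_ratio * 50) * 0.25 +                   # question density
        min(10, (headings + lists) * 1.5) * 0.25 +       # structural elements
        min(10, (entities / wc if wc else 0) * 100) * 0.15 +
        min(10, (numbers / wc if wc else 0) * 200) * 0.15
    )
    return round(score, 1)

print(geo_readability_score(
    "# FAQ\n\nWhat is GEO? GEO tunes content for AI search.\n"
    "- Entities matter.\n- 42 data points help."
))
```

The weights (0.20/0.25/0.25/0.15/0.15) sum to 1, so each component is already capped at 10 before weighting and the result stays on a 0-10 scale.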
utils/parser.py DELETED
@@ -1,549 +0,0 @@
- """
- Content Parsing Module
- Handles extraction of content from PDFs, text, and webpages
- """
-
- import requests
- from bs4 import BeautifulSoup
- from urllib.parse import urljoin, urlparse
- from typing import List, Dict, Any, Optional
- import time
- from langchain_community.document_loaders import PyPDFLoader
- from langchain.schema import Document
-
-
- class BaseParser:
-     """Base class for all content parsers"""
-
-     def __init__(self):
-         self.supported_formats = []
-
-     def parse(self, source: str) -> List[Document]:
-         """Parse content from source and return LangChain Documents"""
-         raise NotImplementedError("Subclasses must implement parse method")
-
-     def validate_source(self, source: str) -> bool:
-         """Validate if the source can be processed"""
-         return True
-
-
- class PDFParser(BaseParser):
-     """Parser for PDF documents"""
-
-     def __init__(self):
-         super().__init__()
-         self.supported_formats = ['.pdf']
-
-     def parse(self, pdf_path: str) -> List[Document]:
-         """
-         Parse PDF file and return list of Document objects
-
-         Args:
-             pdf_path (str): Path to the PDF file
-
-         Returns:
-             List[Document]: List of parsed documents with metadata
-         """
-         try:
-             loader = PyPDFLoader(pdf_path)
-             documents = loader.load_and_split()
-
-             # Add additional metadata
-             for i, doc in enumerate(documents):
-                 doc.metadata.update({
-                     'source_type': 'pdf',
-                     'page_number': i + 1,
-                     'total_pages': len(documents),
-                     'parser': 'PDFParser'
-                 })
-
-             return documents
-
-         except Exception as e:
-             raise Exception(f"Error parsing PDF: {str(e)}")
-
-     def get_pdf_metadata(self, pdf_path: str) -> Dict[str, Any]:
-         """Extract metadata from PDF file"""
-         try:
-             loader = PyPDFLoader(pdf_path)
-             documents = loader.load()
-
-             total_pages = len(documents)
-             total_words = sum(len(doc.page_content.split()) for doc in documents)
-
-             return {
-                 'total_pages': total_pages,
-                 'total_words': total_words,
-                 'average_words_per_page': total_words / total_pages if total_pages > 0 else 0,
-                 'file_type': 'PDF',
-                 'parser_used': 'PyPDFLoader'
-             }
-
-         except Exception as e:
-             return {'error': f"Could not extract metadata: {str(e)}"}
-
-
- class TextParser(BaseParser):
-     """Parser for plain text content"""
-
-     def __init__(self):
-         super().__init__()
-         self.supported_formats = ['.txt', 'plain_text']
-         self.chunk_size = 1000  # Default chunk size for long texts
-
-     def parse(self, text_content: str, chunk_size: int = None) -> List[Document]:
-         """
-         Parse text content and return list of Document objects
-
-         Args:
-             text_content (str): Raw text content
-             chunk_size (int): Optional chunk size for splitting long texts
-
-         Returns:
-             List[Document]: List of documents, potentially chunked
-         """
-         try:
-             if not text_content.strip():
-                 raise ValueError("Empty text content provided")
-
-             chunk_size = chunk_size or self.chunk_size
-
-             # If text is short, return as single document
-             if len(text_content) <= chunk_size:
-                 doc = Document(
-                     page_content=text_content,
-                     metadata={
-                         'source_type': 'text',
-                         'word_count': len(text_content.split()),
-                         'char_count': len(text_content),
-                         'chunk_index': 0,
-                         'total_chunks': 1,
-                         'parser': 'TextParser'
-                     }
-                 )
-                 return [doc]
-
-             # Split long text into chunks
-             chunks = self._split_text_into_chunks(text_content, chunk_size)
-             documents = []
-
-             for i, chunk in enumerate(chunks):
-                 doc = Document(
-                     page_content=chunk,
-                     metadata={
-                         'source_type': 'text',
-                         'word_count': len(chunk.split()),
-                         'char_count': len(chunk),
-                         'chunk_index': i,
-                         'total_chunks': len(chunks),
-                         'parser': 'TextParser'
-                     }
-                 )
-                 documents.append(doc)
-
-             return documents
-
-         except Exception as e:
-             raise Exception(f"Error parsing text: {str(e)}")
-
-     def _split_text_into_chunks(self, text: str, chunk_size: int) -> List[str]:
-         """Split text into chunks while preserving sentence boundaries"""
-         sentences = text.split('. ')
-         chunks = []
-         current_chunk = ""
-
-         for sentence in sentences:
-             # Add sentence to current chunk if it fits
-             test_chunk = current_chunk + sentence + ". "
-
-             if len(test_chunk) <= chunk_size:
-                 current_chunk = test_chunk
-             else:
-                 # Start new chunk if current chunk has content
-                 if current_chunk.strip():
-                     chunks.append(current_chunk.strip())
-                 current_chunk = sentence + ". "
-
-         # Add final chunk if it has content
-         if current_chunk.strip():
-             chunks.append(current_chunk.strip())
-
-         return chunks
-
-     def analyze_text_structure(self, text_content: str) -> Dict[str, Any]:
-         """Analyze the structure and characteristics of text content"""
-         try:
-             lines = text_content.split('\n')
-             words = text_content.split()
-             # Keep only non-empty sentences so every metric uses the same denominator
-             sentences = [s for s in text_content.split('.') if s.strip()]
-
-             # Count different elements
-             paragraphs = [p.strip() for p in text_content.split('\n\n') if p.strip()]
-
-             return {
-                 'total_words': len(words),
-                 'total_sentences': len(sentences),
-                 'total_lines': len(lines),
-                 'total_paragraphs': len(paragraphs),
-                 'average_words_per_sentence': len(words) / len(sentences) if sentences else 0,
-                 'average_sentences_per_paragraph': len(sentences) / len(paragraphs) if paragraphs else 0,
-                 'character_count': len(text_content),
-                 'reading_time_minutes': len(words) / 200,  # Assuming 200 words per minute
-                 'complexity_score': self._calculate_text_complexity(text_content)
-             }
-
-         except Exception as e:
-             return {'error': f"Could not analyze text structure: {str(e)}"}
-
-     def _calculate_text_complexity(self, text: str) -> float:
-         """Calculate a simple text complexity score"""
-         words = text.split()
-         sentences = [s for s in text.split('.') if s.strip()]
-
-         if not sentences:
-             return 0.0
-
-         # Average words per sentence (higher = more complex)
-         avg_words_per_sentence = len(words) / len(sentences)
-
-         # Average characters per word (higher = more complex)
-         avg_chars_per_word = sum(len(word) for word in words) / len(words) if words else 0
-
-         # Simple complexity score (normalized to 1-10 scale)
-         complexity = (avg_words_per_sentence * 0.1) + (avg_chars_per_word * 0.5)
-         return min(complexity, 10.0)
-
-
- class WebpageParser(BaseParser):
-     """Parser for web content"""
-
-     def __init__(self):
-         super().__init__()
-         self.supported_formats = ['http', 'https']
-         self.headers = {
-             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-         }
-         self.timeout = 10
-         self.max_retries = 3
-
-     def parse_website(self, url: str, max_pages: int = 1, include_subpages: bool = False) -> List[Dict[str, Any]]:
-         """
-         Parse website content and return structured data
-
-         Args:
-             url (str): Website URL to parse
-             max_pages (int): Maximum number of pages to parse
-             include_subpages (bool): Whether to include subpages
-
-         Returns:
-             List[Dict]: List of page data with content and metadata
-         """
-         try:
-             pages_data = []
-             urls_to_process = [url]
-             processed_urls = set()
-
-             # If including subpages, find additional URLs
-             if include_subpages and max_pages > 1:
-                 subpage_urls = self._find_subpages(url, max_pages - 1)
-                 urls_to_process.extend(subpage_urls)
-
-             # Process each URL
-             for current_url in urls_to_process[:max_pages]:
-                 if current_url in processed_urls:
-                     continue
-
-                 page_data = self._parse_single_page(current_url)
-                 if page_data:
-                     pages_data.append(page_data)
-                     processed_urls.add(current_url)
-
-                 # Add small delay to be respectful
-                 time.sleep(1)
-
-             return pages_data
-
-         except Exception as e:
-             raise Exception(f"Error parsing website: {str(e)}")
-
-     def _parse_single_page(self, url: str) -> Optional[Dict[str, Any]]:
-         """Parse a single webpage and extract content (None when no response was obtained)"""
-         try:
-             # Make request with retries
-             response = None
-             for attempt in range(self.max_retries):
-                 try:
-                     response = requests.get(url, headers=self.headers, timeout=self.timeout)
-                     response.raise_for_status()
-                     break
-                 except requests.RequestException as e:
-                     if attempt == self.max_retries - 1:
-                         raise e
-                     time.sleep(2 ** attempt)  # Exponential backoff
-
-             if not response:
-                 return None
-
-             # Parse HTML content
-             soup = BeautifulSoup(response.content, 'html.parser')
-
-             # Remove unwanted elements
-             for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
-                 element.decompose()
-
-             # Extract main content
-             main_content = self._extract_main_content(soup)
-
-             # Extract metadata
-             title = self._extract_title(soup)
-             description = self._extract_description(soup)
-             headings = self._extract_headings(soup)
-             links = self._extract_links(soup, url)
-
-             # Clean and process text
-             cleaned_text = self._clean_text_content(main_content)
-
-             return {
-                 'url': url,
-                 'title': title,
-                 'description': description,
-                 'content': cleaned_text,
-                 'headings': headings,
-                 'internal_links': links['internal'],
-                 'external_links': links['external'],
-                 'word_count': len(cleaned_text.split()),
-                 'char_count': len(cleaned_text),
-                 'meta_keywords': self._extract_meta_keywords(soup),
-                 'images': self._extract_images(soup, url),
-                 'parser': 'WebpageParser',
-                 'parsed_at': time.strftime('%Y-%m-%d %H:%M:%S')
-             }
-
-         except Exception as e:
-             return {'url': url, 'error': f"Failed to parse page: {str(e)}"}
-
-     def _extract_main_content(self, soup: BeautifulSoup) -> str:
-         """Extract the main content from the page"""
-         # Try to find main content in order of preference
-         content_selectors = [
-             'main',
-             'article',
-             '[role="main"]',
-             '.content',
-             '.main-content',
-             '#content',
-             '#main',
-             '.post-content',
-             '.entry-content'
-         ]
-
-         for selector in content_selectors:
-             element = soup.select_one(selector)
-             if element:
-                 return element.get_text(separator=' ', strip=True)
-
-         # Fallback to body content
-         body = soup.find('body')
-         if body:
-             return body.get_text(separator=' ', strip=True)
-
-         return soup.get_text(separator=' ', strip=True)
-
-     def _extract_title(self, soup: BeautifulSoup) -> str:
-         """Extract page title"""
-         title_tag = soup.find('title')
-         if title_tag:
-             return title_tag.get_text().strip()
-
-         # Fallback to h1
-         h1 = soup.find('h1')
-         if h1:
-             return h1.get_text().strip()
-
-         return "No Title Found"
-
-     def _extract_description(self, soup: BeautifulSoup) -> str:
-         """Extract meta description"""
-         meta_desc = soup.find('meta', attrs={'name': 'description'})
-         if meta_desc and meta_desc.get('content'):
-             return meta_desc['content'].strip()
-
-         # Fallback to Open Graph description
-         og_desc = soup.find('meta', attrs={'property': 'og:description'})
-         if og_desc and og_desc.get('content'):
-             return og_desc['content'].strip()
-
-         return "No Description Found"
-
-     def _extract_headings(self, soup: BeautifulSoup) -> List[Dict[str, Any]]:
-         """Extract all headings with their hierarchy"""
-         headings = []
-
-         for i in range(1, 7):  # h1 to h6
-             for heading in soup.find_all(f'h{i}'):
-                 text = heading.get_text(strip=True)
-                 if text:
-                     headings.append({
-                         'level': i,
-                         'text': text,
-                         'id': heading.get('id', ''),
-                         'class': heading.get('class', [])
-                     })
-
-         return headings
-
-     def _extract_links(self, soup: BeautifulSoup, base_url: str) -> Dict[str, List[str]]:
-         """Extract internal and external links"""
-         internal_links = []
-         external_links = []
-         base_domain = urlparse(base_url).netloc
-
-         for link in soup.find_all('a', href=True):
-             href = link['href']
-             full_url = urljoin(base_url, href)
-             parsed_url = urlparse(full_url)
-
-             if parsed_url.netloc == base_domain:
-                 internal_links.append(full_url)
-             elif parsed_url.netloc:  # External link with domain
-                 external_links.append(full_url)
-
-         return {
-             'internal': list(set(internal_links)),
-             'external': list(set(external_links))
-         }
-
-     def _extract_meta_keywords(self, soup: BeautifulSoup) -> List[str]:
-         """Extract meta keywords if available"""
-         meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
-         if meta_keywords and meta_keywords.get('content'):
-             keywords = meta_keywords['content'].split(',')
-             return [kw.strip() for kw in keywords if kw.strip()]
-         return []
-
-     def _extract_images(self, soup: BeautifulSoup, base_url: str) -> List[Dict[str, str]]:
-         """Extract image information"""
-         images = []
-
-         for img in soup.find_all('img'):
-             src = img.get('src')
-             if src:
-                 full_url = urljoin(base_url, src)
-                 images.append({
-                     'src': full_url,
-                     'alt': img.get('alt', ''),
-                     'title': img.get('title', '')
-                 })
-
-         return images
-
-     def _clean_text_content(self, text: str) -> str:
-         """Clean and normalize text content"""
-         if not text:
-             return ""
-
-         # Split into lines and clean each line
-         lines = text.split('\n')
-         cleaned_lines = []
-
-         for line in lines:
-             line = line.strip()
-             if line and len(line) > 1:  # Skip empty lines and single characters
-                 cleaned_lines.append(line)
-
-         # Join lines with single spaces
-         cleaned_text = ' '.join(cleaned_lines)
-
-         # Remove multiple spaces
-         while '  ' in cleaned_text:
-             cleaned_text = cleaned_text.replace('  ', ' ')
-
-         return cleaned_text
-
-     def _find_subpages(self, url: str, max_subpages: int) -> List[str]:
-         """Find subpages from the main page"""
-         try:
-             response = requests.get(url, headers=self.headers, timeout=self.timeout)
-             response.raise_for_status()
-
-             soup = BeautifulSoup(response.content, 'html.parser')
-             base_domain = urlparse(url).netloc
-             subpages = set()
-
-             # Find internal links
-             for link in soup.find_all('a', href=True):
-                 href = link['href']
-                 full_url = urljoin(url, href)
-                 parsed_url = urlparse(full_url)
-
-                 # Only include internal links from same domain
-                 if (parsed_url.netloc == base_domain and
-                         full_url != url and
-                         not any(ext in full_url.lower() for ext in ['.pdf', '.jpg', '.png', '.gif', '.zip'])):
-                     subpages.add(full_url)
-
-                     if len(subpages) >= max_subpages:
-                         break
-
-             return list(subpages)[:max_subpages]
-
-         except Exception:
-             return []
-
-     def validate_url(self, url: str) -> bool:
-         """Validate if URL is accessible"""
-         try:
-             response = requests.head(url, headers=self.headers, timeout=5)
-             return response.status_code == 200
-         except requests.RequestException:  # bare except narrowed to the relevant errors
-             return False
-
-     def get_website_info(self, url: str) -> Dict[str, Any]:
-         """Get basic information about a website"""
-         try:
-             response = requests.get(url, headers=self.headers, timeout=self.timeout)
-             response.raise_for_status()
-
-             soup = BeautifulSoup(response.content, 'html.parser')
-
-             return {
-                 'url': url,
-                 'title': self._extract_title(soup),
-                 'description': self._extract_description(soup),
-                 'meta_keywords': self._extract_meta_keywords(soup),
-                 'has_robots_meta': bool(soup.find('meta', attrs={'name': 'robots'})),
-                 'has_viewport_meta': bool(soup.find('meta', attrs={'name': 'viewport'})),
-                 'language': soup.html.get('lang', 'unknown') if soup.html else 'unknown',  # lang lives on the <html> tag, not the soup root
-                 'status_code': response.status_code,
-                 'content_type': response.headers.get('content-type', 'unknown'),
-                 'server': response.headers.get('server', 'unknown')
-             }
-
-         except Exception as e:
-             return {'url': url, 'error': f"Could not get website info: {str(e)}"}
-
-
- class ParserFactory:
-     """Factory class to create appropriate parsers"""
-
-     @staticmethod
-     def get_parser(source_type: str):
-         """Get the appropriate parser for the source type"""
-         parsers = {
-             'pdf': PDFParser(),
-             'text': TextParser(),
-             'webpage': WebpageParser(),
-             'url': WebpageParser()
-         }
-
-         return parsers.get(source_type.lower())
-
-     @staticmethod
-     def detect_source_type(source: str) -> str:
-         """Detect the type of content source"""
-         if source.startswith(('http://', 'https://')):
-             return 'webpage'
-         elif source.endswith('.pdf'):
-             return 'pdf'
-         else:
-             return 'text'
 
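The deleted `TextParser._split_text_into_chunks` used greedy sentence-boundary packing: keep appending sentences to the open chunk until the next one would exceed the size limit, then flush. A minimal standalone sketch of that logic (the free function `split_into_chunks` is illustrative, not the removed method itself):

```python
from typing import List

def split_into_chunks(text: str, chunk_size: int = 1000) -> List[str]:
    """Greedy sentence-boundary chunking, mirroring the deleted TextParser logic."""
    chunks: List[str] = []
    current = ""
    for sentence in text.split('. '):
        candidate = current + sentence + ". "
        if len(candidate) <= chunk_size:
            current = candidate               # sentence still fits in the open chunk
        else:
            if current.strip():
                chunks.append(current.strip())
            current = sentence + ". "         # overflow sentence starts a new chunk
    if current.strip():
        chunks.append(current.strip())        # flush the final partial chunk
    return chunks

print(split_into_chunks("First sentence. Second sentence. Third sentence.", chunk_size=30))
```

Note one inherited quirk: splitting on `'. '` leaves the trailing period on the last fragment, so the final chunk can end in a doubled period, exactly as the original method did.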
utils/scorer.py DELETED
@@ -1,484 +0,0 @@
- """
- GEO Scoring Module
- Analyzes content for Generative Engine Optimization (GEO) performance
- """
-
- import json
- from typing import Dict, Any, List
- from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
-
-
- class GEOScorer:
-     """Main class for calculating GEO scores and analysis"""
-
-     def __init__(self, llm):
-         self.llm = llm
-         self.setup_prompts()
-
-     def setup_prompts(self):
-         """Initialize prompts for different types of analysis"""
-
-         # Main GEO analysis prompt
-         self.geo_analysis_prompt = (
-             "You are a Generative Engine Optimization (GEO) Specialist. Your task is to critically analyze the input content for its effectiveness in AI-powered search engines and large language model (LLM) systems. "
-             "Evaluate the content using the following GEO criteria, assigning a score from 1 to 10 for each: \n\n"
-             "1. AI Search Visibility - How likely is the content to be surfaced by AI search engines?\n"
-             "2. Query Intent Matching - How well does the content align with common user queries?\n"
-             "3. Factual Accuracy & Authority - How trustworthy and authoritative is the information?\n"
-             "4. Conversational Readiness - Is the content well-suited for AI chat responses?\n"
-             "5. Semantic Richness - Does the content effectively use relevant semantic keywords?\n"
-             "6. Context Completeness - Is the content self-contained and does it provide complete answers?\n"
-             "7. Citation Worthiness - How likely is the content to be cited by AI systems?\n"
-             "8. Multi-Query Coverage - Does the content address multiple related questions?\n\n"
-             "Also provide:\n"
-             "- Key topics and entities mentioned\n"
-             "- Missing information or content gaps\n"
-             "- Specific optimization opportunities\n"
-             "- Actionable enhancement recommendations\n\n"
-             "Respond strictly in JSON format using the structure below (double curly braces shown here to escape string formatting, do NOT include them in actual output):\n\n"
-             "{{\n"
-             "  \"geo_scores\": {{\n"
-             "    \"ai_search_visibility\": 0.0,\n"
-             "    \"query_intent_matching\": 0.0,\n"
-             "    \"factual_accuracy\": 0.0,\n"
-             "    \"conversational_readiness\": 0.0,\n"
-             "    \"semantic_richness\": 0.0,\n"
-             "    \"context_completeness\": 0.0,\n"
-             "    \"citation_worthiness\": 0.0,\n"
-             "    \"multi_query_coverage\": 0.0\n"
-             "  }},\n"
-             "  \"overall_geo_score\": 0.0,\n"
-             "  \"primary_topics\": [\"topic1\", \"topic2\"],\n"
-             "  \"entities\": [\"entity1\", \"entity2\"],\n"
-             "  \"missing_gaps\": [\"gap1\", \"gap2\"],\n"
-             "  \"optimization_opportunities\": [\n"
-             "    {{\n"
-             "      \"type\": \"semantic_enhancement\",\n"
-             "      \"description\": \"Describe the improvement opportunity\",\n"
-             "      \"priority\": \"high\"\n"
-             "    }}\n"
-             "  ],\n"
-             "  \"recommendations\": [\n"
-             "    \"Write clear and specific suggestions to improve the content\"\n"
-             "  ]\n"
-             "}}"
-         )
-
-         # Quick scoring prompt for faster analysis
-         self.quick_score_prompt = (
-             "You are an AI Search Optimization Analyst. Evaluate the given content and provide a quick scoring based on key criteria.\n"
-             "Rate each of the following from 1 to 10:\n"
-             "1. AI Search Visibility\n"
-             "2. Query Intent Matching\n"
-             "3. Conversational Readiness\n"
-             "4. Citation Worthiness\n\n"
-             "Respond strictly in JSON format using the structure below:\n"
-             "{{\n"
-             "  \"scores\": {{\n"
-             "    \"ai_search_visibility\": 0.0,\n"
-             "    \"query_intent_matching\": 0.0,\n"
-             "    \"conversational_readiness\": 0.0,\n"
-             "    \"citation_worthiness\": 0.0\n"
-             "  }},\n"
-             "  \"overall_score\": 0.0,\n"
-             "  \"top_recommendation\": \"Provide the most critical improvement needed\"\n"
-             "}}"
-         )
-
-         # Competitive analysis prompt
-         self.competitive_prompt = (
-             "Compare these content pieces for GEO performance. Identify which performs better for AI search and why.\n"
-             "Content A: {content_a}\n"
-             "Content B: {content_b}\n"
-             "Provide analysis in JSON:\n"
-             "{{\n"
-             "  \"winner\": \"A\" or \"B\",\n"
-             "  \"score_comparison\": {{\n"
-             "    \"content_a_score\": 7.5,\n"
-             "    \"content_b_score\": 8.2\n"
-             "  }},\n"
-             "  \"key_differences\": [\"difference1\", \"difference2\"],\n"
-             "  \"improvement_suggestions\": {{\n"
-             "    \"content_a\": [\"suggestion1\"],\n"
-             "    \"content_b\": [\"suggestion1\"]\n"
-             "  }}\n"
-             "}}"
-         )
-
-     def analyze_page_geo(self, content: str, title: str, detailed: bool = True) -> Dict[str, Any]:
-         """
-         Analyze a single page for GEO performance
-         """
-         try:
-             # Choose prompt based on detail level
-             if detailed:
-                 system_prompt = self.geo_analysis_prompt
-                 user_message = f"Title: {title}\n\nContent: {content[:8000]}"
-             else:
-                 system_prompt = self.quick_score_prompt
-                 user_message = f"Title: {title}\n\nContent: {content[:4000]}"
-
-             # Escape literal braces so arbitrary page content is not parsed
-             # as template variables by ChatPromptTemplate.
-             user_message = user_message.replace('{', '{{').replace('}', '}}')
-
-             # Build prompt and run analysis
-             prompt_template = ChatPromptTemplate.from_messages([
-                 SystemMessagePromptTemplate.from_template(system_prompt),
-                 HumanMessagePromptTemplate.from_template(user_message)
-             ])
-             chain = prompt_template | self.llm
-             result = chain.invoke({})  # No variables needed
-
-             # Extract and parse result
-             result_content = result.content if hasattr(result, 'content') else str(result)
-             parsed_result = self._parse_llm_response(result_content)
-
-             # Add metadata
-             parsed_result.update({
-                 'analyzed_title': title,
-                 'content_length': len(content),
-                 'word_count': len(content.split()),
-                 'analysis_type': 'detailed' if detailed else 'quick'
-             })
-
-             return parsed_result
-
-         except Exception as e:
-             return {'error': f"GEO analysis failed: {str(e)}"}
-
-     def analyze_multiple_pages(self, pages_data: List[Dict[str, Any]], detailed: bool = True) -> List[Dict[str, Any]]:
-         """
-         Analyze multiple pages and return consolidated results
-
-         Args:
-             pages_data (List[Dict]): List of page data with content and metadata
-             detailed (bool): Whether to perform detailed analysis
-
-         Returns:
-             List[Dict]: List of GEO analysis results
-         """
-         results = []
-
-         for i, page_data in enumerate(pages_data):
-             try:
-                 content = page_data.get('content', '')
-                 title = page_data.get('title', f'Page {i+1}')
-
-                 analysis = self.analyze_page_geo(content, title, detailed)
-
-                 # Add page-specific metadata
-                 analysis.update({
-                     'page_url': page_data.get('url', ''),
-                     'page_index': i,
-                     'source_word_count': page_data.get('word_count', 0)
-                 })
-
-                 results.append(analysis)
-
-             except Exception as e:
-                 results.append({
-                     'page_index': i,
-                     'page_url': page_data.get('url', ''),
-                     'error': f"Analysis failed: {str(e)}"
-                 })
-
-         return results
-
-     def compare_content_geo(self, content_a: str, content_b: str, titles: tuple = None) -> Dict[str, Any]:
-         """
-         Compare two pieces of content for GEO performance
-
-         Args:
-             content_a (str): First content to compare
-             content_b (str): Second content to compare
-             titles (tuple): Optional titles for the content pieces
-
-         Returns:
-             Dict: Comparison analysis results
-         """
-         try:
-             title_a, title_b = titles if titles else ("Content A", "Content B")
-
-             # Format the competitive analysis prompt
-             formatted_prompt = self.competitive_prompt.format(
-                 content_a=f"Title: {title_a}\nContent: {content_a[:4000]}",
-                 content_b=f"Title: {title_b}\nContent: {content_b[:4000]}"
-             )
-
-             # Invoke the LLM directly: after .format() the prompt contains literal
-             # JSON braces that ChatPromptTemplate would misread as template variables.
-             result = self.llm.invoke(formatted_prompt)
-             result_content = result.content if hasattr(result, 'content') else str(result)
-
-             return self._parse_llm_response(result_content)
-
-         except Exception as e:
-             return {'error': f"Comparison analysis failed: {str(e)}"}
-
-     def calculate_aggregate_scores(self, individual_results: List[Dict[str, Any]]) -> Dict[str, Any]:
-         """
-         Calculate aggregate GEO scores from multiple page analyses
-
-         Args:
-             individual_results (List[Dict]): List of individual page analysis results
-
-         Returns:
-             Dict: Aggregate scores and insights
-         """
-         try:
-             valid_results = [r for r in individual_results if 'geo_scores' in r and not r.get('error')]
-
-             if not valid_results:
-                 return {'error': 'No valid results to aggregate'}
-
-             # Calculate average scores
-             score_keys = list(valid_results[0]['geo_scores'].keys())
-             avg_scores = {}
-
-             for key in score_keys:
-                 scores = [r['geo_scores'][key] for r in valid_results if key in r['geo_scores']]
-                 avg_scores[key] = sum(scores) / len(scores) if scores else 0
-
-             overall_avg = sum(avg_scores.values()) / len(avg_scores) if avg_scores else 0
-
-             # Collect all recommendations and opportunities
-             all_recommendations = []
-             all_opportunities = []
-             all_topics = []
-             all_entities = []
-
-             for result in valid_results:
-                 all_recommendations.extend(result.get('recommendations', []))
-                 all_opportunities.extend(result.get('optimization_opportunities', []))
-                 all_topics.extend(result.get('primary_topics', []))
-                 all_entities.extend(result.get('entities', []))
-
-             # Remove duplicates and prioritize
-             unique_recommendations = list(set(all_recommendations))
-             unique_topics = list(set(all_topics))
-             unique_entities = list(set(all_entities))
-
-             # Find highest and lowest performing areas
-             best_score = max(avg_scores.items(), key=lambda x: x[1]) if avg_scores else ('none', 0)
-             worst_score = min(avg_scores.items(), key=lambda x: x[1]) if avg_scores else ('none', 0)
-
-             return {
-                 'aggregate_scores': avg_scores,
-                 'overall_score': overall_avg,
-                 'pages_analyzed': len(valid_results),
-                 'best_performing_metric': {
-                     'metric': best_score[0],
-                     'score': best_score[1]
-                 },
-                 'lowest_performing_metric': {
-                     'metric': worst_score[0],
-                     'score': worst_score[1]
-                 },
-                 'consolidated_recommendations': unique_recommendations[:10],
-                 'all_topics': unique_topics,
-                 'all_entities': unique_entities,
-                 'high_priority_opportunities': [
-                     opp for opp in all_opportunities
-                     if opp.get('priority') == 'high'
-                 ][:5],
-                 'score_distribution': self._calculate_score_distribution(avg_scores)
-             }
-
-         except Exception as e:
-             return {'error': f"Aggregation failed: {str(e)}"}
-
-     def generate_geo_report(self, analysis_results: Dict[str, Any], website_url: str = None) -> Dict[str, Any]:
-         """
-         Generate a comprehensive GEO report
-
-         Args:
-             analysis_results (Dict): Results from aggregate analysis
-             website_url (str): Optional website URL for context
-
-         Returns:
-             Dict: Comprehensive GEO report
-         """
-         try:
-             report = {
-                 'report_metadata': {
-                     'generated_at': self._get_timestamp(),
-                     'website_url': website_url,
-                     'analysis_type': 'GEO Performance Report'
-                 },
-                 'executive_summary': self._generate_executive_summary(analysis_results),
-                 'detailed_scores': analysis_results.get('aggregate_scores', {}),
-                 'performance_insights': self._generate_performance_insights(analysis_results),
-                 'actionable_recommendations': self._prioritize_recommendations(
-                     analysis_results.get('consolidated_recommendations', [])
-                 ),
-                 'optimization_roadmap': self._create_optimization_roadmap(analysis_results),
-                 'competitive_position': self._assess_competitive_position(analysis_results),
-                 'technical_details': {
-                     'pages_analyzed': analysis_results.get('pages_analyzed', 0),
-                     'overall_score': analysis_results.get('overall_score', 0),
-                     'score_distribution': analysis_results.get('score_distribution', {})
-                 }
-             }
-
-             return report
-
-         except Exception as e:
-             return {'error': f"Report generation failed: {str(e)}"}
-
-     def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
-         """Parse LLM response and extract JSON content"""
-         try:
-             # Find JSON content in the response
-             json_start = response_text.find('{')
-             json_end = response_text.rfind('}') + 1
-
-             if json_start != -1 and json_end > 0:  # rfind returns -1 when absent, so json_end == 0 means no closing brace
-                 json_str = response_text[json_start:json_end]
-                 return json.loads(json_str)
-             else:
-                 # If no JSON found, return the raw response
-                 return {'raw_response': response_text, 'parsing_error': 'No JSON found'}
-
-         except json.JSONDecodeError as e:
-             return {'raw_response': response_text, 'parsing_error': f'JSON decode error: {str(e)}'}
-         except Exception as e:
-             return {'raw_response': response_text, 'parsing_error': f'Unexpected error: {str(e)}'}
-
-     def _calculate_score_distribution(self, scores: Dict[str, float]) -> Dict[str, Any]:
-         """Calculate distribution of scores for insights"""
-         if not scores:
-             return {}
-
-         score_values = list(scores.values())
-
-         return {
-             'highest_score': max(score_values),
-             'lowest_score': min(score_values),
-             'average_score': sum(score_values) / len(score_values),
-             'score_range': max(score_values) - min(score_values),
-             'scores_above_7': len([s for s in score_values if s >= 7.0]),
-             'scores_below_5': len([s for s in score_values if s < 5.0])
-         }
-
-     def _generate_executive_summary(self, analysis_results: Dict[str, Any]) -> str:
-         """Generate executive summary based on analysis results"""
-         overall_score = analysis_results.get('overall_score', 0)
-         pages_analyzed = analysis_results.get('pages_analyzed', 0)
-
-         if overall_score >= 8.0:
-             performance = "excellent"
-         elif overall_score >= 6.5:
-             performance = "good"
-         elif overall_score >= 5.0:
-             performance = "moderate"
-         else:
-             performance = "needs improvement"
-
-         return f"Analysis of {pages_analyzed} pages shows {performance} GEO performance with an overall score of {overall_score:.1f}/10. Key opportunities exist in {analysis_results.get('lowest_performing_metric', {}).get('metric', 'multiple areas')}."
-
-     def _generate_performance_insights(self, analysis_results: Dict[str, Any]) -> List[str]:
-         """Generate performance insights based on analysis"""
-         insights = []
-
-         best_metric = analysis_results.get('best_performing_metric', {})
-         worst_metric = analysis_results.get('lowest_performing_metric', {})
-
-         if best_metric.get('score', 0) >= 8.0:
-             insights.append(f"Strong performance in {best_metric.get('metric', 'unknown')} (score: {best_metric.get('score', 0):.1f})")
-
-         if worst_metric.get('score', 10) < 6.0:
-             insights.append(f"Significant improvement needed in {worst_metric.get('metric', 'unknown')} (score: {worst_metric.get('score', 0):.1f})")
-
-         score_dist = analysis_results.get('score_distribution', {})
-         if score_dist.get('score_range', 0) > 3.0:
-             insights.append("High variability in scores indicates inconsistent optimization across metrics")
-
-         return insights
-
-     def _prioritize_recommendations(self, recommendations: List[str]) -> List[Dict[str, Any]]:
-         """Prioritize recommendations based on impact potential"""
-         prioritized = []
-
-         # Simple prioritization based on keywords
-         high_impact_keywords = ['semantic', 'structure', 'authority', 'factual']
-         medium_impact_keywords = ['readability', 'clarity', 'format']
-
-         for i, rec in enumerate(recommendations):
-             priority = 'low'
-             if any(keyword in rec.lower() for keyword in high_impact_keywords):
-                 priority = 'high'
-             elif any(keyword in rec.lower() for keyword in medium_impact_keywords):
-                 priority = 'medium'
-
-             prioritized.append({
-                 'recommendation': rec,
-                 'priority': priority,
-                 'order': i + 1
-             })
-
-         # Sort by priority
-         priority_order = {'high': 1, 'medium': 2, 'low': 3}
-         prioritized.sort(key=lambda x: priority_order[x['priority']])
-
-         return prioritized
-
-     def _create_optimization_roadmap(self, analysis_results: Dict[str, Any]) -> Dict[str, List[str]]:
-         """Create a phased optimization roadmap"""
-         roadmap = {
-             'immediate_actions': [],
-             'short_term_goals': [],
-             'long_term_strategy': []
-         }
-
-         overall_score = analysis_results.get('overall_score', 0)
-         worst_metric = analysis_results.get('lowest_performing_metric', {})
-
-         # Immediate actions based on worst performing metric
-         if worst_metric.get('score', 10) < 5.0:
-             roadmap['immediate_actions'].append(f"Address critical issues in {worst_metric.get('metric', 'low-scoring areas')}")
-
-         # Short-term goals
-         if overall_score < 7.0:
-             roadmap['short_term_goals'].append("Improve overall GEO score to above 7.0")
-             roadmap['short_term_goals'].append("Enhance content structure and semantic richness")
-
-         # Long-term strategy
-         roadmap['long_term_strategy'].append("Establish consistent GEO optimization process")
-         roadmap['long_term_strategy'].append("Monitor and track AI search performance")
-
-         return roadmap
-
-     def _assess_competitive_position(self, analysis_results: Dict[str, Any]) -> Dict[str, Any]:
-         """Assess competitive position based on scores"""
-         overall_score = analysis_results.get('overall_score', 0)
-
-         if overall_score >= 8.5:
-             position = "market_leader"
-             description = "Content is highly optimized for AI search engines"
-         elif overall_score >= 7.0:
-             position = "competitive"
-             description = "Content performs well but has room for improvement"
-         elif overall_score >= 5.5:
-             position = "average"
-             description = "Content meets basic standards but lacks optimization"
-         else:
-             position = "needs_work"
-             description = "Content requires significant optimization for AI search"
-
-         return {
-             'position': position,
-             'description': description,
-             'score': overall_score,
-             'percentile_estimate': min(overall_score * 10, 100)  # Rough percentile estimate
-         }
-
-     def _get_timestamp(self) -> str:
-         """Get current timestamp"""
-         from datetime import datetime
-         return datetime.now().strftime('%Y-%m-%d %H:%M:%S')
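The core of the deleted `calculate_aggregate_scores` is a per-metric average across pages plus best/worst extraction. A minimal sketch under the same result shape ( `aggregate_geo_scores` and the sample `pages` data are illustrative, not part of the removed module):

```python
from statistics import mean

def aggregate_geo_scores(results: list) -> dict:
    """Average per-metric GEO scores across pages, mirroring the deleted aggregation."""
    valid = [r for r in results if 'geo_scores' in r and not r.get('error')]
    if not valid:
        return {'error': 'No valid results to aggregate'}
    metrics = valid[0]['geo_scores'].keys()
    avg = {k: mean(r['geo_scores'][k] for r in valid if k in r['geo_scores'])
           for k in metrics}
    best, worst = max(avg, key=avg.get), min(avg, key=avg.get)
    return {
        'aggregate_scores': avg,
        'overall_score': mean(avg.values()),
        'best_performing_metric': {'metric': best, 'score': avg[best]},
        'lowest_performing_metric': {'metric': worst, 'score': avg[worst]},
    }

pages = [
    {'geo_scores': {'ai_search_visibility': 7.0, 'citation_worthiness': 5.0}},
    {'geo_scores': {'ai_search_visibility': 9.0, 'citation_worthiness': 6.0}},
]
print(aggregate_geo_scores(pages))
```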