MBilal-72 committed on
Commit
c76bc58
·
verified ·
1 Parent(s): 67e514b

Upload Utils and its files

Browse files
Files changed (5) hide show
  1. utils/chunker.py +1314 -0
  2. utils/export.py +1896 -0
  3. utils/optimizer.py +558 -0
  4. utils/parser.py +549 -0
  5. utils/scorer.py +501 -0
utils/chunker.py ADDED
@@ -0,0 +1,1314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Vector Chunking and RAG Module
3
+ Handles document chunking, vector embeddings, and RAG question-answering
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import numpy as np
9
+ from typing import Dict, Any, List, Optional, Tuple
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
11
+ from langchain.schema import Document
12
+ from langchain_community.vectorstores import FAISS, Chroma
13
+ from langchain.chains import RetrievalQA, ConversationalRetrievalChain
14
+ from langchain.memory import ConversationBufferMemory
15
+ from langchain.prompts import PromptTemplate
16
+ import tempfile
17
+ import shutil
18
+
19
+
20
class VectorChunker:
    """Main class for document chunking and vector operations."""

    def __init__(self, embeddings_model, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Set up the chunker with an embeddings backend and chunk parameters.

        Args:
            embeddings_model: LangChain-compatible embeddings object used by
                every vector-store and embedding operation in this class.
            chunk_size (int): Target chunk length in characters.
            chunk_overlap (int): Character overlap between consecutive chunks.
        """
        # Core configuration read by the splitters built below.
        self.embeddings = embeddings_model
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Cache for vector stores (not populated by the methods visible here).
        self.vector_stores = {}
        self.setup_text_splitters()
30
+ def setup_text_splitters(self):
31
+ """Initialize different text splitting strategies"""
32
+
33
+ # Default recursive splitter
34
+ self.recursive_splitter = RecursiveCharacterTextSplitter(
35
+ chunk_size=self.chunk_size,
36
+ chunk_overlap=self.chunk_overlap,
37
+ length_function=len,
38
+ separators=["\n\n", "\n", " ", ""]
39
+ )
40
+
41
+ # Character-based splitter
42
+ self.character_splitter = CharacterTextSplitter(
43
+ chunk_size=self.chunk_size,
44
+ chunk_overlap=self.chunk_overlap,
45
+ separator="\n\n"
46
+ )
47
+
48
+ # Semantic splitter for better context preservation
49
+ self.semantic_splitter = RecursiveCharacterTextSplitter(
50
+ chunk_size=800, # Smaller chunks for better semantic coherence
51
+ chunk_overlap=150,
52
+ length_function=len,
53
+ separators=["\n\n", "\n", ". ", " ", ""]
54
+ )
55
+
56
+ def chunk_documents(self, documents: List[Document], strategy: str = "recursive") -> List[Document]:
57
+ """
58
+ Chunk documents using specified strategy
59
+
60
+ Args:
61
+ documents (List[Document]): List of documents to chunk
62
+ strategy (str): Chunking strategy ("recursive", "character", "semantic")
63
+
64
+ Returns:
65
+ List[Document]: List of chunked documents
66
+ """
67
+ try:
68
+ # Choose splitter based on strategy
69
+ if strategy == "character":
70
+ splitter = self.character_splitter
71
+ elif strategy == "semantic":
72
+ splitter = self.semantic_splitter
73
+ else:
74
+ splitter = self.recursive_splitter
75
+
76
+ # Split documents
77
+ chunked_docs = []
78
+
79
+ for doc in documents:
80
+ chunks = splitter.split_documents([doc])
81
+
82
+ # Add chunk metadata
83
+ for i, chunk in enumerate(chunks):
84
+ chunk.metadata.update({
85
+ 'chunk_index': i,
86
+ 'total_chunks': len(chunks),
87
+ 'chunk_strategy': strategy,
88
+ 'original_source': doc.metadata.get('source', 'unknown'),
89
+ 'chunk_size': len(chunk.page_content),
90
+ 'chunk_word_count': len(chunk.page_content.split())
91
+ })
92
+
93
+ chunked_docs.extend(chunks)
94
+
95
+ return chunked_docs
96
+
97
+ except Exception as e:
98
+ raise Exception(f"Document chunking failed: {str(e)}")
99
+
100
+ def create_vector_store(self, documents: List[Document], store_type: str = "faiss",
101
+ persist_directory: Optional[str] = None) -> Any:
102
+ """
103
+ Create vector store from documents
104
+
105
+ Args:
106
+ documents (List[Document]): Documents to vectorize
107
+ store_type (str): Type of vector store ("faiss", "chroma")
108
+ persist_directory (str): Optional directory to persist the store
109
+
110
+ Returns:
111
+ Vector store instance
112
+ """
113
+ try:
114
+ if not documents:
115
+ raise ValueError("No documents provided for vector store creation")
116
+
117
+ if store_type.lower() == "chroma":
118
+ if persist_directory:
119
+ vector_store = Chroma.from_documents(
120
+ documents=documents,
121
+ embedding=self.embeddings,
122
+ persist_directory=persist_directory
123
+ )
124
+ vector_store.persist()
125
+ else:
126
+ vector_store = Chroma.from_documents(
127
+ documents=documents,
128
+ embedding=self.embeddings
129
+ )
130
+ else: # Default to FAISS
131
+ vector_store = FAISS.from_documents(
132
+ documents=documents,
133
+ embedding=self.embeddings
134
+ )
135
+
136
+ # Save FAISS index if persist directory provided
137
+ if persist_directory:
138
+ os.makedirs(persist_directory, exist_ok=True)
139
+ vector_store.save_local(persist_directory)
140
+
141
+ return vector_store
142
+
143
+ except Exception as e:
144
+ raise Exception(f"Vector store creation failed: {str(e)}")
145
+
146
+ def create_qa_chain(self, documents: List[Document], llm, chain_type: str = "stuff") -> RetrievalQA:
147
+ """
148
+ Create a Question-Answering chain from documents
149
+
150
+ Args:
151
+ documents (List[Document]): Documents for the knowledge base
152
+ llm: Language model for answering questions
153
+ chain_type (str): Type of QA chain ("stuff", "map_reduce", "refine")
154
+
155
+ Returns:
156
+ RetrievalQA: Configured QA chain
157
+ """
158
+ try:
159
+ # Chunk documents
160
+ chunked_docs = self.chunk_documents(documents, strategy="semantic")
161
+
162
+ # Create vector store
163
+ vector_store = self.create_vector_store(chunked_docs, store_type="faiss")
164
+
165
+ # Create retriever
166
+ retriever = vector_store.as_retriever(
167
+ search_type="similarity",
168
+ search_kwargs={"k": 4} # Retrieve top 4 most relevant chunks
169
+ )
170
+
171
+ # Custom prompt for GEO-focused QA
172
+ qa_prompt_template = """Use the following pieces of context to answer the question at the end.
173
+ If you don't know the answer, just say that you don't know, don't try to make up an answer.
174
+ Focus on providing clear, accurate, and complete answers that would be suitable for AI search engines.
175
+
176
+ Context:
177
+ {context}
178
+
179
+ Question: {question}
180
+
181
+ Answer:"""
182
+
183
+ qa_prompt = PromptTemplate(
184
+ template=qa_prompt_template,
185
+ input_variables=["context", "question"]
186
+ )
187
+
188
+ # Create QA chain
189
+ qa_chain = RetrievalQA.from_chain_type(
190
+ llm=llm,
191
+ chain_type=chain_type,
192
+ retriever=retriever,
193
+ return_source_documents=True,
194
+ chain_type_kwargs={"prompt": qa_prompt}
195
+ )
196
+
197
+ return qa_chain
198
+
199
+ except Exception as e:
200
+ raise Exception(f"QA chain creation failed: {str(e)}")
201
+
202
+ def create_conversational_chain(self, documents: List[Document], llm) -> ConversationalRetrievalChain:
203
+ """
204
+ Create a conversational retrieval chain with memory
205
+
206
+ Args:
207
+ documents (List[Document]): Documents for the knowledge base
208
+ llm: Language model for conversation
209
+
210
+ Returns:
211
+ ConversationalRetrievalChain: Configured conversational chain
212
+ """
213
+ try:
214
+ # Chunk documents
215
+ chunked_docs = self.chunk_documents(documents, strategy="semantic")
216
+
217
+ # Create vector store
218
+ vector_store = self.create_vector_store(chunked_docs, store_type="faiss")
219
+
220
+ # Create retriever
221
+ retriever = vector_store.as_retriever(
222
+ search_type="similarity",
223
+ search_kwargs={"k": 3}
224
+ )
225
+
226
+ # Create memory
227
+ memory = ConversationBufferMemory(
228
+ memory_key="chat_history",
229
+ return_messages=True,
230
+ output_key="answer"
231
+ )
232
+
233
+ # Custom prompt for conversational QA
234
+ condense_question_prompt = """Given the following conversation and a follow up question,
235
+ rephrase the follow up question to be a standalone question that can be understood without the chat history.
236
+
237
+ Chat History:
238
+ {chat_history}
239
+ Follow Up Input: {question}
240
+ Standalone question:"""
241
+
242
+ # Create conversational chain
243
+ conv_chain = ConversationalRetrievalChain.from_llm(
244
+ llm=llm,
245
+ retriever=retriever,
246
+ memory=memory,
247
+ return_source_documents=True,
248
+ condense_question_prompt=PromptTemplate.from_template(condense_question_prompt)
249
+ )
250
+
251
+ return conv_chain
252
+
253
+ except Exception as e:
254
+ raise Exception(f"Conversational chain creation failed: {str(e)}")
255
+
256
+ def semantic_search(self, query: str, documents: List[Document], top_k: int = 5) -> List[Dict[str, Any]]:
257
+ """
258
+ Perform semantic search on documents
259
+
260
+ Args:
261
+ query (str): Search query
262
+ documents (List[Document]): Documents to search
263
+ top_k (int): Number of top results to return
264
+
265
+ Returns:
266
+ List[Dict]: Search results with scores
267
+ """
268
+ try:
269
+ # Chunk documents
270
+ chunked_docs = self.chunk_documents(documents, strategy="semantic")
271
+
272
+ # Create vector store
273
+ vector_store = self.create_vector_store(chunked_docs, store_type="faiss")
274
+
275
+ # Perform similarity search with scores
276
+ results = vector_store.similarity_search_with_score(query, k=top_k)
277
+
278
+ # Format results
279
+ formatted_results = []
280
+ for doc, score in results:
281
+ result = {
282
+ 'content': doc.page_content,
283
+ 'metadata': doc.metadata,
284
+ 'similarity_score': float(score),
285
+ 'relevance_rank': len(formatted_results) + 1
286
+ }
287
+ formatted_results.append(result)
288
+
289
+ return formatted_results
290
+
291
+ except Exception as e:
292
+ raise Exception(f"Semantic search failed: {str(e)}")
293
+
294
+ def analyze_document_similarity(self, documents: List[Document]) -> Dict[str, Any]:
295
+ """
296
+ Analyze similarity between documents
297
+
298
+ Args:
299
+ documents (List[Document]): Documents to analyze
300
+
301
+ Returns:
302
+ Dict: Similarity analysis results
303
+ """
304
+ try:
305
+ if len(documents) < 2:
306
+ return {'error': 'Need at least 2 documents for similarity analysis'}
307
+
308
+ # Chunk documents
309
+ chunked_docs = self.chunk_documents(documents, strategy="semantic")
310
+
311
+ # Create embeddings for each document
312
+ doc_embeddings = []
313
+ doc_metadata = []
314
+
315
+ for doc in chunked_docs:
316
+ # Get embedding for the document
317
+ embedding = self.embeddings.embed_query(doc.page_content)
318
+ doc_embeddings.append(embedding)
319
+ doc_metadata.append({
320
+ 'content_preview': doc.page_content[:200] + "...",
321
+ 'metadata': doc.metadata,
322
+ 'length': len(doc.page_content)
323
+ })
324
+
325
+ # Calculate pairwise similarities
326
+ similarities = []
327
+ embeddings_array = np.array(doc_embeddings)
328
+
329
+ for i in range(len(embeddings_array)):
330
+ for j in range(i + 1, len(embeddings_array)):
331
+ # Calculate cosine similarity
332
+ similarity = np.dot(embeddings_array[i], embeddings_array[j]) / (
333
+ np.linalg.norm(embeddings_array[i]) * np.linalg.norm(embeddings_array[j])
334
+ )
335
+
336
+ similarities.append({
337
+ 'doc_1_index': i,
338
+ 'doc_2_index': j,
339
+ 'similarity_score': float(similarity),
340
+ 'doc_1_preview': doc_metadata[i]['content_preview'],
341
+ 'doc_2_preview': doc_metadata[j]['content_preview']
342
+ })
343
+
344
+ # Sort by similarity score
345
+ similarities.sort(key=lambda x: x['similarity_score'], reverse=True)
346
+
347
+ # Calculate statistics
348
+ similarity_scores = [s['similarity_score'] for s in similarities]
349
+
350
+ return {
351
+ 'total_comparisons': len(similarities),
352
+ 'average_similarity': np.mean(similarity_scores),
353
+ 'max_similarity': max(similarity_scores),
354
+ 'min_similarity': min(similarity_scores),
355
+ 'similarity_distribution': {
356
+ 'high_similarity': len([s for s in similarity_scores if s > 0.8]),
357
+ 'medium_similarity': len([s for s in similarity_scores if 0.5 < s <= 0.8]),
358
+ 'low_similarity': len([s for s in similarity_scores if s <= 0.5])
359
+ },
360
+ 'top_similar_pairs': similarities[:5],
361
+ 'most_dissimilar_pairs': similarities[-3:]
362
+ }
363
+
364
+ except Exception as e:
365
+ return {'error': f"Similarity analysis failed: {str(e)}"}
366
+
367
+ def extract_key_passages(self, documents: List[Document], queries: List[str],
368
+ passages_per_query: int = 3) -> Dict[str, List[Dict[str, Any]]]:
369
+ """
370
+ Extract key passages from documents based on multiple queries
371
+
372
+ Args:
373
+ documents (List[Document]): Documents to search
374
+ queries (List[str]): List of queries to search for
375
+ passages_per_query (int): Number of passages to extract per query
376
+
377
+ Returns:
378
+ Dict: Key passages organized by query
379
+ """
380
+ try:
381
+ # Chunk documents
382
+ chunked_docs = self.chunk_documents(documents, strategy="semantic")
383
+
384
+ # Create vector store
385
+ vector_store = self.create_vector_store(chunked_docs, store_type="faiss")
386
+
387
+ key_passages = {}
388
+
389
+ for query in queries:
390
+ # Search for relevant passages
391
+ results = vector_store.similarity_search_with_score(query, k=passages_per_query)
392
+
393
+ passages = []
394
+ for doc, score in results:
395
+ passage = {
396
+ 'content': doc.page_content,
397
+ 'relevance_score': float(score),
398
+ 'metadata': doc.metadata,
399
+ 'word_count': len(doc.page_content.split()),
400
+ 'query_match': query
401
+ }
402
+ passages.append(passage)
403
+
404
+ key_passages[query] = passages
405
+
406
+ return key_passages
407
+
408
+ except Exception as e:
409
+ return {'error': f"Key passage extraction failed: {str(e)}"}
410
+
411
+ def optimize_chunking_strategy(self, documents: List[Document],
412
+ test_queries: List[str]) -> Dict[str, Any]:
413
+ """
414
+ Test different chunking strategies and recommend the best one
415
+
416
+ Args:
417
+ documents (List[Document]): Documents to test
418
+ test_queries (List[str]): Queries to test retrieval performance
419
+
420
+ Returns:
421
+ Dict: Optimization results and recommendations
422
+ """
423
+ try:
424
+ strategies = ["recursive", "character", "semantic"]
425
+ strategy_results = {}
426
+
427
+ for strategy in strategies:
428
+ try:
429
+ # Test this strategy
430
+ chunked_docs = self.chunk_documents(documents, strategy=strategy)
431
+ vector_store = self.create_vector_store(chunked_docs, store_type="faiss")
432
+
433
+ # Test retrieval performance
434
+ retrieval_scores = []
435
+
436
+ for query in test_queries:
437
+ results = vector_store.similarity_search_with_score(query, k=3)
438
+
439
+ # Calculate average relevance score
440
+ if results:
441
+ avg_score = sum(score for _, score in results) / len(results)
442
+ retrieval_scores.append(float(avg_score))
443
+
444
+ # Calculate strategy metrics
445
+ avg_retrieval_score = np.mean(retrieval_scores) if retrieval_scores else 0
446
+ total_chunks = len(chunked_docs)
447
+ avg_chunk_size = np.mean([len(doc.page_content) for doc in chunked_docs])
448
+
449
+ strategy_results[strategy] = {
450
+ 'average_retrieval_score': avg_retrieval_score,
451
+ 'total_chunks': total_chunks,
452
+ 'average_chunk_size': avg_chunk_size,
453
+ 'retrieval_scores': retrieval_scores,
454
+ 'chunk_size_distribution': {
455
+ 'min': min(len(doc.page_content) for doc in chunked_docs),
456
+ 'max': max(len(doc.page_content) for doc in chunked_docs),
457
+ 'std': float(np.std([len(doc.page_content) for doc in chunked_docs]))
458
+ }
459
+ }
460
+
461
+ except Exception as e:
462
+ strategy_results[strategy] = {'error': f"Strategy test failed: {str(e)}"}
463
+
464
+ # Determine best strategy
465
+ valid_strategies = {k: v for k, v in strategy_results.items() if 'error' not in v}
466
+
467
+ if valid_strategies:
468
+ best_strategy = max(valid_strategies.keys(),
469
+ key=lambda k: valid_strategies[k]['average_retrieval_score'])
470
+
471
+ recommendation = {
472
+ 'recommended_strategy': best_strategy,
473
+ 'reason': f"Best average retrieval score: {valid_strategies[best_strategy]['average_retrieval_score']:.4f}",
474
+ 'all_results': strategy_results,
475
+ 'performance_summary': {
476
+ strategy: result.get('average_retrieval_score', 0)
477
+ for strategy, result in valid_strategies.items()
478
+ }
479
+ }
480
+ else:
481
+ recommendation = {
482
+ 'recommended_strategy': 'recursive', # Default fallback
483
+ 'reason': 'All strategies failed, using default',
484
+ 'all_results': strategy_results
485
+ }
486
+
487
+ return recommendation
488
+
489
+ except Exception as e:
490
+ return {'error': f"Chunking optimization failed: {str(e)}"}
491
+
492
+ def create_document_summary(self, documents: List[Document], llm,
493
+ summary_type: str = "extractive") -> Dict[str, Any]:
494
+ """
495
+ Create document summaries using the chunked content
496
+
497
+ Args:
498
+ documents (List[Document]): Documents to summarize
499
+ llm: Language model for summarization
500
+ summary_type (str): Type of summary ("extractive", "abstractive")
501
+
502
+ Returns:
503
+ Dict: Summary results
504
+ """
505
+ try:
506
+ # Chunk documents for better processing
507
+ chunked_docs = self.chunk_documents(documents, strategy="semantic")
508
+
509
+ if summary_type == "extractive":
510
+ # Extract key sentences/chunks
511
+ return self._create_extractive_summary(chunked_docs)
512
+ else:
513
+ # Generate abstractive summary using LLM
514
+ return self._create_abstractive_summary(chunked_docs, llm)
515
+
516
+ except Exception as e:
517
+ return {'error': f"Document summarization failed: {str(e)}"}
518
+
519
+ def _create_extractive_summary(self, chunked_docs: List[Document]) -> Dict[str, Any]:
520
+ """Create extractive summary by selecting key chunks"""
521
+ try:
522
+ # Simple extractive approach: select chunks with highest semantic density
523
+ chunk_scores = []
524
+
525
+ for doc in chunked_docs:
526
+ content = doc.page_content
527
+ # Simple scoring based on content characteristics
528
+ word_count = len(content.split())
529
+ sentence_count = len([s for s in content.split('.') if s.strip()])
530
+
531
+ # Score based on information density
532
+ density_score = word_count / max(sentence_count, 1)
533
+
534
+ # Bonus for chunks with questions, definitions, or lists
535
+ structure_bonus = 0
536
+ if '?' in content:
537
+ structure_bonus += 1
538
+ if any(word in content.lower() for word in ['define', 'definition', 'means', 'refers to']):
539
+ structure_bonus += 2
540
+ if content.count('\n•') > 0 or content.count('1.') > 0:
541
+ structure_bonus += 1
542
+
543
+ total_score = density_score + structure_bonus
544
+ chunk_scores.append((doc, total_score))
545
+
546
+ # Sort by score and select top chunks for summary
547
+ chunk_scores.sort(key=lambda x: x[1], reverse=True)
548
+ top_chunks = chunk_scores[:min(5, len(chunk_scores))]
549
+
550
+ summary_content = []
551
+ for doc, score in top_chunks:
552
+ summary_content.append({
553
+ 'content': doc.page_content,
554
+ 'score': score,
555
+ 'metadata': doc.metadata
556
+ })
557
+
558
+ return {
559
+ 'summary_type': 'extractive',
560
+ 'key_chunks': summary_content,
561
+ 'total_chunks_analyzed': len(chunked_docs),
562
+ 'chunks_selected': len(top_chunks)
563
+ }
564
+
565
+ except Exception as e:
566
+ return {'error': f"Extractive summary failed: {str(e)}"}
567
+
568
+ def _create_abstractive_summary(self, chunked_docs: List[Document], llm) -> Dict[str, Any]:
569
+ """Create abstractive summary using language model"""
570
+ try:
571
+ # Combine content from top chunks
572
+ combined_content = "\n\n".join([doc.page_content for doc in chunked_docs[:10]])
573
+
574
+ summary_prompt = f"""Please provide a comprehensive summary of the following content.
575
+ Focus on the main topics, key insights, and important details that would be valuable for AI search engines.
576
+
577
+ Content:
578
+ {combined_content[:5000]}
579
+
580
+ Summary:"""
581
+
582
+ from langchain.prompts import ChatPromptTemplate
583
+
584
+ prompt_template = ChatPromptTemplate.from_messages([
585
+ ("system", "You are a professional content summarizer. Create clear, informative summaries."),
586
+ ("user", summary_prompt)
587
+ ])
588
+
589
+ chain = prompt_template | llm
590
+ result = chain.invoke({})
591
+
592
+ summary_text = result.content if hasattr(result, 'content') else str(result)
593
+
594
+ return {
595
+ 'summary_type': 'abstractive',
596
+ 'summary': summary_text,
597
+ 'source_chunks': len(chunked_docs),
598
+ 'content_length_processed': len(combined_content)
599
+ }
600
+
601
+ except Exception as e:
602
+ return {'error': f"Abstractive summary failed: {str(e)}"}
603
+
604
+ def save_vector_store(self, vector_store, directory_path: str, store_type: str = "faiss") -> bool:
605
+ """
606
+ Save vector store to disk
607
+
608
+ Args:
609
+ vector_store: Vector store instance to save
610
+ directory_path (str): Directory to save the store
611
+ store_type (str): Type of vector store
612
+
613
+ Returns:
614
+ bool: Success status
615
+ """
616
+ try:
617
+ os.makedirs(directory_path, exist_ok=True)
618
+
619
+ if store_type.lower() == "faiss":
620
+ vector_store.save_local(directory_path)
621
+ elif store_type.lower() == "chroma":
622
+ # Chroma stores are typically persisted during creation
623
+ pass
624
+
625
+ return True
626
+
627
+ except Exception as e:
628
+ print(f"Failed to save vector store: {str(e)}")
629
+ return False
630
+
631
+ def load_vector_store(self, directory_path: str, store_type: str = "faiss"):
632
+ """
633
+ Load vector store from disk
634
+
635
+ Args:
636
+ directory_path (str): Directory containing the saved store
637
+ store_type (str): Type of vector store
638
+
639
+ Returns:
640
+ Vector store instance or None if failed
641
+ """
642
+ try:
643
+ if not os.path.exists(directory_path):
644
+ return None
645
+
646
+ if store_type.lower() == "faiss":
647
+ vector_store = FAISS.load_local(
648
+ directory_path,
649
+ self.embeddings,
650
+ allow_dangerous_deserialization=True
651
+ )
652
+ return vector_store
653
+ elif store_type.lower() == "chroma":
654
+ vector_store = Chroma(
655
+ persist_directory=directory_path,
656
+ embedding_function=self.embeddings
657
+ )
658
+ return vector_store
659
+
660
+ return None
661
+
662
+ except Exception as e:
663
+ print(f"Failed to load vector store: {str(e)}")
664
+ return None
665
+
666
+ def get_chunking_stats(self, documents: List[Document], strategy: str = "recursive") -> Dict[str, Any]:
667
+ """
668
+ Get detailed statistics about document chunking
669
+
670
+ Args:
671
+ documents (List[Document]): Documents to analyze
672
+ strategy (str): Chunking strategy to use
673
+
674
+ Returns:
675
+ Dict: Detailed chunking statistics
676
+ """
677
+ try:
678
+ # Chunk documents
679
+ chunked_docs = self.chunk_documents(documents, strategy=strategy)
680
+
681
+ # Calculate statistics
682
+ chunk_sizes = [len(doc.page_content) for doc in chunked_docs]
683
+ word_counts = [len(doc.page_content.split()) for doc in chunked_docs]
684
+
685
+ stats = {
686
+ 'strategy_used': strategy,
687
+ 'original_documents': len(documents),
688
+ 'total_chunks': len(chunked_docs),
689
+ 'chunk_size_stats': {
690
+ 'min': min(chunk_sizes) if chunk_sizes else 0,
691
+ 'max': max(chunk_sizes) if chunk_sizes else 0,
692
+ 'mean': np.mean(chunk_sizes) if chunk_sizes else 0,
693
+ 'median': np.median(chunk_sizes) if chunk_sizes else 0,
694
+ 'std': np.std(chunk_sizes) if chunk_sizes else 0
695
+ },
696
+ 'word_count_stats': {
697
+ 'min': min(word_counts) if word_counts else 0,
698
+ 'max': max(word_counts) if word_counts else 0,
699
+ 'mean': np.mean(word_counts) if word_counts else 0,
700
+ 'median': np.median(word_counts) if word_counts else 0,
701
+ 'std': np.std(word_counts) if word_counts else 0
702
+ },
703
+ 'chunk_distribution': {
704
+ 'very_small': len([s for s in chunk_sizes if s < 200]),
705
+ 'small': len([s for s in chunk_sizes if 200 <= s < 500]),
706
+ 'medium': len([s for s in chunk_sizes if 500 <= s < 1000]),
707
+ 'large': len([s for s in chunk_sizes if 1000 <= s < 2000]),
708
+ 'very_large': len([s for s in chunk_sizes if s >= 2000])
709
+ },
710
+ 'overlap_efficiency': self._calculate_overlap_efficiency(chunked_docs),
711
+ 'content_coverage': self._calculate_content_coverage(documents, chunked_docs)
712
+ }
713
+
714
+ return stats
715
+
716
+ except Exception as e:
717
+ return {'error': f"Chunking statistics failed: {str(e)}"}
718
+
719
+ def _calculate_overlap_efficiency(self, chunked_docs: List[Document]) -> float:
720
+ """Calculate efficiency of chunk overlaps"""
721
+ try:
722
+ if len(chunked_docs) < 2:
723
+ return 1.0
724
+
725
+ total_content_length = sum(len(doc.page_content) for doc in chunked_docs)
726
+ unique_content = set()
727
+
728
+ # Rough estimate of content uniqueness
729
+ for doc in chunked_docs:
730
+ words = doc.page_content.split()
731
+ for i in range(0, len(words), 10): # Sample every 10th word
732
+ unique_content.add(' '.join(words[i:i+10]))
733
+
734
+ # Efficiency as ratio of unique content to total content
735
+ efficiency = len(unique_content) * 10 / total_content_length if total_content_length > 0 else 0
736
+ return min(efficiency, 1.0)
737
+
738
+ except Exception:
739
+ return 0.5 # Default neutral efficiency
740
+
741
+ def _calculate_content_coverage(self, original_docs: List[Document],
742
+ chunked_docs: List[Document]) -> float:
743
+ """Calculate how well chunks cover original content"""
744
+ try:
745
+ original_content = ' '.join([doc.page_content for doc in original_docs])
746
+ chunked_content = ' '.join([doc.page_content for doc in chunked_docs])
747
+
748
+ # Simple coverage metric based on length
749
+ coverage = len(chunked_content) / len(original_content) if original_content else 0
750
+ return min(coverage, 1.0)
751
+
752
+ except Exception:
753
+ return 0.0
754
+
755
+
756
class ChunkingOptimizer:
    """Helper class for optimizing chunking parameters."""

    def __init__(self, embeddings_model):
        # Embeddings backend handed to each trial VectorChunker.
        self.embeddings = embeddings_model

    def optimize_chunk_size(self, documents: List[Document], test_queries: List[str],
                            size_range: Tuple[int, int] = (200, 2000),
                            step_size: int = 200) -> Dict[str, Any]:
        """
        Find the chunk size giving the best retrieval for the given queries.

        Each candidate size gets its own VectorChunker + FAISS store; the test
        queries are run against it and the mean score per size recorded.

        NOTE: FAISS similarity_search_with_score returns L2 distances, where
        LOWER means more relevant — so the optimal size is the one with the
        LOWEST average score. (The original code picked the highest, i.e. the
        worst-performing size.)

        Args:
            documents (List[Document]): Documents to test.
            test_queries (List[str]): Queries for testing retrieval.
            size_range (Tuple[int, int]): Inclusive range of sizes to test.
            step_size (int): Step between tested sizes.

        Returns:
            Dict: Optimal size, per-size results and a trend summary;
            {'error': ...} when nothing could be tested.
        """
        try:
            results = {}
            min_size, max_size = size_range

            for chunk_size in range(min_size, max_size + 1, step_size):
                chunker = VectorChunker(self.embeddings, chunk_size=chunk_size)

                try:
                    chunked_docs = chunker.chunk_documents(documents)
                    vector_store = chunker.create_vector_store(chunked_docs)

                    retrieval_scores = []
                    for query in test_queries:
                        search_results = vector_store.similarity_search_with_score(query, k=3)
                        if search_results:
                            avg_score = sum(score for _, score in search_results) / len(search_results)
                            retrieval_scores.append(float(avg_score))

                    results[chunk_size] = {
                        'average_retrieval_score': np.mean(retrieval_scores) if retrieval_scores else 0,
                        'total_chunks': len(chunked_docs),
                        'retrieval_scores': retrieval_scores
                    }

                except Exception as e:
                    results[chunk_size] = {'error': str(e)}

            valid_results = {k: v for k, v in results.items() if 'error' not in v}

            if not valid_results:
                return {
                    'error': 'No valid chunk sizes could be tested',
                    'all_results': results
                }

            # Lowest average distance == best retrieval (see NOTE above).
            optimal_size = min(valid_results.keys(),
                               key=lambda k: valid_results[k]['average_retrieval_score'])

            return {
                'optimal_chunk_size': optimal_size,
                'optimal_performance': valid_results[optimal_size]['average_retrieval_score'],
                'all_results': results,
                'performance_trend': self._analyze_performance_trend(valid_results),
                'recommendation': f"Use chunk size {optimal_size} for best retrieval performance"
            }

        except Exception as e:
            return {'error': f"Chunk size optimization failed: {str(e)}"}

    def _analyze_performance_trend(self, results: Dict[int, Dict[str, Any]]) -> Dict[str, Any]:
        """Summarize how the average score moves across chunk sizes.

        Scores are FAISS L2 distances (lower is better), so the "peak"
        reported here is the minimum score and the size that produced it
        (aligned with the min-based selection in optimize_chunk_size).
        """
        try:
            sizes = sorted(results.keys())
            performances = [results[size]['average_retrieval_score'] for size in sizes]

            if len(performances) < 2:
                return {'error': 'Insufficient data for trend analysis'}

            trend_direction = "increasing" if performances[-1] > performances[0] else "decreasing"
            peak_performance = min(performances)  # lowest distance = best
            peak_size = sizes[performances.index(peak_performance)]
            spread = max(performances) - min(performances)

            return {
                'trend_direction': trend_direction,
                'peak_performance': peak_performance,
                'peak_size': peak_size,
                'performance_range': spread,
                'stable_performance': spread < 0.1
            }

        except Exception:
            return {'error': 'Trend analysis failed'}
855
+
856
+
857
class RAGPipeline:
    """Complete RAG pipeline for document question-answering.

    Each pipeline is identified by a caller-supplied ``pipeline_id`` and
    bundles a vector store plus a retrieval QA chain built from the same
    documents. Multiple independent pipelines can coexist on one instance.
    """

    def __init__(self, embeddings_model, llm):
        # Models shared by every pipeline this instance creates.
        self.embeddings = embeddings_model
        self.llm = llm
        self.chunker = VectorChunker(embeddings_model)
        # Per-pipeline components, keyed by pipeline_id.
        self.vector_stores = {}
        self.qa_chains = {}

    def create_pipeline(self, documents: List["Document"], pipeline_id: str,
                        chunking_strategy: str = "semantic") -> Dict[str, Any]:
        """
        Create a complete RAG pipeline for documents

        Args:
            documents (List[Document]): Documents to process
            pipeline_id (str): Unique identifier for this pipeline
            chunking_strategy (str): Strategy for document chunking

        Returns:
            Dict: Pipeline creation results, or a dict with an 'error' key
        """
        try:
            # Step 1: Chunk documents
            chunked_docs = self.chunker.chunk_documents(documents, strategy=chunking_strategy)

            # Step 2: Create vector store from the chunks
            vector_store = self.chunker.create_vector_store(chunked_docs, store_type="faiss")

            # Step 3: Create QA chain
            # NOTE(review): the chain is built from the raw documents, not the
            # chunks computed above; presumably create_qa_chain chunks
            # internally — confirm against VectorChunker.
            qa_chain = self.chunker.create_qa_chain(documents, self.llm)

            # Store pipeline components under the caller's id
            self.vector_stores[pipeline_id] = vector_store
            self.qa_chains[pipeline_id] = qa_chain

            # Pipeline statistics
            stats = {
                'pipeline_id': pipeline_id,
                'documents_processed': len(documents),
                'chunks_created': len(chunked_docs),
                'chunking_strategy': chunking_strategy,
                'vector_store_type': 'faiss',
                'embedding_model': str(self.embeddings),
                'created_at': self._get_timestamp()
            }

            return {
                'success': True,
                'pipeline_stats': stats,
                'chunking_info': self.chunker.get_chunking_stats(documents, chunking_strategy)
            }

        except Exception as e:
            return {'error': f"Pipeline creation failed: {str(e)}"}

    def query_pipeline(self, pipeline_id: str, query: str,
                       return_sources: bool = True) -> Dict[str, Any]:
        """
        Query a created RAG pipeline

        Args:
            pipeline_id (str): ID of the pipeline to query
            query (str): Question to ask
            return_sources (bool): Whether to return source documents

        Returns:
            Dict: Query results with answer and (optionally) sources
        """
        try:
            if pipeline_id not in self.qa_chains:
                return {'error': f"Pipeline '{pipeline_id}' not found"}

            qa_chain = self.qa_chains[pipeline_id]

            # Execute query (legacy LangChain chain __call__ protocol)
            result = qa_chain({"query": query})

            # Format response
            response = {
                'query': query,
                'answer': result.get('result', 'No answer generated'),
                'pipeline_id': pipeline_id,
                'query_timestamp': self._get_timestamp()
            }

            # Add source documents if requested
            if return_sources and 'source_documents' in result:
                sources = []
                for i, doc in enumerate(result['source_documents']):
                    source = {
                        'source_index': i,
                        'content': doc.page_content,
                        'metadata': doc.metadata,
                        # Retriever returns documents in relevance order
                        'relevance_rank': i + 1
                    }
                    sources.append(source)

                response['sources'] = sources
                response['num_sources'] = len(sources)

            return response

        except Exception as e:
            return {'error': f"Pipeline query failed: {str(e)}"}

    def batch_query_pipeline(self, pipeline_id: str, queries: List[str]) -> List[Dict[str, Any]]:
        """
        Execute multiple queries on a pipeline

        Args:
            pipeline_id (str): ID of the pipeline to query
            queries (List[str]): List of questions to ask

        Returns:
            List[Dict]: One result dict per query, each tagged with its
            'batch_index'; failures become {'batch_index', 'query', 'error'}.
        """
        results = []

        for i, query in enumerate(queries):
            try:
                # Sources are suppressed in batch mode to keep results compact
                result = self.query_pipeline(pipeline_id, query, return_sources=False)
                result['batch_index'] = i
                results.append(result)

            except Exception as e:
                results.append({
                    'batch_index': i,
                    'query': query,
                    'error': f"Batch query failed: {str(e)}"
                })

        return results

    def evaluate_pipeline(self, pipeline_id: str, test_queries: List[str],
                          expected_answers: List[str] = None) -> Dict[str, Any]:
        """
        Evaluate pipeline performance on test queries

        Args:
            pipeline_id (str): ID of the pipeline to evaluate
            test_queries (List[str]): Test questions
            expected_answers (List[str]): Optional expected answers for comparison

        Returns:
            Dict: Aggregate and per-query evaluation metrics
        """
        try:
            import time  # hoisted: was re-imported on every loop iteration

            if pipeline_id not in self.qa_chains:
                return {'error': f"Pipeline '{pipeline_id}' not found"}

            evaluation_results = []
            response_times = []

            for i, query in enumerate(test_queries):
                start_time = time.time()

                # Execute query with sources so we can count them
                result = self.query_pipeline(pipeline_id, query, return_sources=True)

                response_time = time.time() - start_time
                response_times.append(response_time)

                # Per-query evaluation record
                eval_result = {
                    'query_index': i,
                    'query': query,
                    'answer_generated': not result.get('error'),
                    'response_time': response_time,
                    'answer_length': len(result.get('answer', '')),
                    'sources_returned': result.get('num_sources', 0)
                }

                # If expected answer provided, calculate similarity
                if expected_answers and i < len(expected_answers):
                    expected = expected_answers[i]
                    generated = result.get('answer', '')

                    # Simple word-overlap (Jaccard) similarity metric
                    similarity = self._calculate_answer_similarity(expected, generated)
                    eval_result['answer_similarity'] = similarity
                    eval_result['expected_answer'] = expected

                evaluation_results.append(eval_result)

            # Calculate aggregate metrics
            successful_queries = len([r for r in evaluation_results if r['answer_generated']])
            avg_response_time = np.mean(response_times) if response_times else 0

            if expected_answers:
                similarities = [r.get('answer_similarity', 0) for r in evaluation_results
                                if 'answer_similarity' in r]
                avg_similarity = np.mean(similarities) if similarities else 0
            else:
                avg_similarity = None

            return {
                'pipeline_id': pipeline_id,
                'total_queries': len(test_queries),
                'successful_queries': successful_queries,
                'success_rate': successful_queries / len(test_queries) if test_queries else 0,
                'average_response_time': avg_response_time,
                'average_answer_similarity': avg_similarity,
                'detailed_results': evaluation_results,
                'evaluation_timestamp': self._get_timestamp()
            }

        except Exception as e:
            return {'error': f"Pipeline evaluation failed: {str(e)}"}

    def _calculate_answer_similarity(self, expected: str, generated: str) -> float:
        """Jaccard similarity on lowercase word sets; 1.0 when both empty."""
        try:
            expected_words = set(expected.lower().split())
            generated_words = set(generated.lower().split())

            if not expected_words and not generated_words:
                return 1.0

            intersection = expected_words.intersection(generated_words)
            union = expected_words.union(generated_words)

            return len(intersection) / len(union) if union else 0.0

        except Exception:
            return 0.0

    def get_pipeline_info(self, pipeline_id: str) -> Dict[str, Any]:
        """Get information about a specific pipeline (components, vector count)."""
        try:
            if pipeline_id not in self.qa_chains:
                return {'error': f"Pipeline '{pipeline_id}' not found"}

            # Get vector store info; FAISS exposes the count via index.ntotal
            vector_store = self.vector_stores.get(pipeline_id)
            if vector_store:
                try:
                    total_vectors = vector_store.index.ntotal if hasattr(vector_store, 'index') else 'unknown'
                except Exception:  # was a bare except: — don't swallow SystemExit/KeyboardInterrupt
                    total_vectors = 'unknown'
            else:
                total_vectors = 'unknown'

            return {
                'pipeline_id': pipeline_id,
                'has_qa_chain': pipeline_id in self.qa_chains,
                'has_vector_store': pipeline_id in self.vector_stores,
                'total_vectors': total_vectors,
                'embedding_model': str(self.embeddings),
                'llm_model': str(self.llm)
            }

        except Exception as e:
            return {'error': f"Failed to get pipeline info: {str(e)}"}

    def list_pipelines(self) -> Dict[str, Any]:
        """List all created pipelines"""
        return {
            'total_pipelines': len(self.qa_chains),
            'pipeline_ids': list(self.qa_chains.keys()),
            'vector_stores': list(self.vector_stores.keys())
        }

    def delete_pipeline(self, pipeline_id: str) -> Dict[str, Any]:
        """Delete a pipeline and free resources"""
        try:
            deleted_components = []

            if pipeline_id in self.qa_chains:
                del self.qa_chains[pipeline_id]
                deleted_components.append('qa_chain')

            if pipeline_id in self.vector_stores:
                del self.vector_stores[pipeline_id]
                deleted_components.append('vector_store')

            if deleted_components:
                return {
                    'success': True,
                    'pipeline_id': pipeline_id,
                    'deleted_components': deleted_components
                }
            else:
                return {'error': f"Pipeline '{pipeline_id}' not found"}

        except Exception as e:
            return {'error': f"Pipeline deletion failed: {str(e)}"}

    def export_pipeline_config(self, pipeline_id: str) -> Dict[str, Any]:
        """Export pipeline configuration for recreation"""
        try:
            if pipeline_id not in self.qa_chains:
                return {'error': f"Pipeline '{pipeline_id}' not found"}

            config = {
                'pipeline_id': pipeline_id,
                'embedding_model_name': getattr(self.embeddings, 'model_name', 'unknown'),
                'llm_model_name': getattr(self.llm, 'model_name', 'unknown'),
                'chunker_config': {
                    'chunk_size': self.chunker.chunk_size,
                    'chunk_overlap': self.chunker.chunk_overlap
                },
                'export_timestamp': self._get_timestamp(),
                'vector_store_type': 'faiss'
            }

            return config

        except Exception as e:
            return {'error': f"Pipeline export failed: {str(e)}"}

    def _get_timestamp(self) -> str:
        """Get current timestamp as 'YYYY-MM-DD HH:MM:SS'."""
        from datetime import datetime
        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')
1177
+
1178
+
1179
+ # Utility functions for the module
1180
+
1181
def optimize_rag_pipeline(documents: List[Document], embeddings_model, llm,
                          test_queries: List[str]) -> Dict[str, Any]:
    """
    Optimize RAG pipeline configuration for given documents and queries

    Args:
        documents (List[Document]): Documents to optimize for
        embeddings_model: Embedding model to use
        llm: Language model to use
        test_queries (List[str]): Test queries for optimization

    Returns:
        Dict: Optimization recommendations
    """
    try:
        # Stage 1: compare chunking strategies, then chunk sizes, independently.
        strategy_report = VectorChunker(embeddings_model).optimize_chunking_strategy(documents, test_queries)
        size_report = ChunkingOptimizer(embeddings_model).optimize_chunk_size(documents, test_queries)

        chosen_strategy = strategy_report.get('recommended_strategy', 'semantic')
        chosen_size = size_report.get('optimal_chunk_size', 1000)
        overlap = chosen_size // 5  # 20% overlap

        # Stage 2: build a throwaway pipeline with the winning configuration.
        candidate = RAGPipeline(embeddings_model, llm)
        candidate.chunker = VectorChunker(
            embeddings_model,
            chunk_size=chosen_size,
            chunk_overlap=overlap
        )

        trial_id = "optimization_test"
        build_outcome = candidate.create_pipeline(documents, trial_id, chosen_strategy)

        if build_outcome.get('error'):
            trial_evaluation = {'error': 'Could not evaluate optimized pipeline'}
        else:
            trial_evaluation = candidate.evaluate_pipeline(trial_id, test_queries)
            candidate.delete_pipeline(trial_id)  # Clean up

        return {
            'optimization_complete': True,
            'recommended_config': {
                'chunking_strategy': chosen_strategy,
                'chunk_size': chosen_size,
                'chunk_overlap': overlap
            },
            'chunking_optimization': strategy_report,
            'size_optimization': size_report,
            'performance_evaluation': trial_evaluation,
            'recommendations': [
                f"Use {chosen_strategy} chunking strategy",
                f"Set chunk size to {chosen_size} characters",
                f"Use {overlap} character overlap",
                "Monitor and adjust based on query performance"
            ]
        }

    except Exception as e:
        return {'error': f"RAG optimization failed: {str(e)}"}
1248
+
1249
+
1250
def create_demo_rag_system(sample_documents: List[Document], embeddings_model, llm) -> Dict[str, Any]:
    """
    Create a demonstration RAG system with sample documents

    Args:
        sample_documents (List[Document]): Sample documents for demo
        embeddings_model: Embedding model
        llm: Language model

    Returns:
        Dict: Demo system information and sample interactions
    """
    try:
        demo_id = "demo_system"
        demo = RAGPipeline(embeddings_model, llm)

        # Build the demo pipeline with the default semantic strategy.
        setup = demo.create_pipeline(sample_documents, demo_id, "semantic")
        if setup.get('error'):
            return {'error': f"Demo system creation failed: {setup['error']}"}

        # Canned questions showcasing typical usage.
        demo_queries = [
            "What is the main topic of these documents?",
            "Can you summarize the key points?",
            "What are the most important concepts mentioned?"
        ]
        demo_results = [
            demo.query_pipeline(demo_id, question, return_sources=True)
            for question in demo_queries
        ]

        return {
            'demo_system_created': True,
            'pipeline_id': demo_id,
            'creation_stats': setup,
            'pipeline_info': demo.get_pipeline_info(demo_id),
            'demo_queries': demo_queries,
            'demo_results': demo_results,
            'usage_instructions': [
                f"Use pipeline.query_pipeline('{demo_id}', 'your question') to ask questions",
                "The system will return answers with source document references",
                "Sources show which parts of the documents were used for the answer"
            ]
        }

    except Exception as e:
        return {'error': f"Demo system creation failed: {str(e)}"}
1305
+
1306
+
1307
+ # Export the main classes for use in other modules
1308
# Names exported by `from <module> import *` — the module's public API.
__all__ = [
    'VectorChunker',
    'ChunkingOptimizer',
    'RAGPipeline',
    'optimize_rag_pipeline',
    'create_demo_rag_system'
]
utils/export.py ADDED
@@ -0,0 +1,1896 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Results Export and Reporting Module
3
+ Handles export of analysis results, reports, and data for external use
4
+ """
5
+
6
+ import json
7
+ import csv
8
+ import io
9
+ import zipfile
10
+ import tempfile
11
+ import os
12
+ from datetime import datetime
13
+ from typing import Dict, Any, List, Optional, Union
14
+ import pandas as pd
15
+ from dataclasses import dataclass, asdict
16
+
17
+
18
@dataclass
class GEOReport:
    """Data class for GEO analysis reports.

    Plain record consumed by the exporters in this module; ``to_dict``
    yields a JSON-serializable mapping via ``dataclasses.asdict``.
    """
    website_url: str                                  # URL of the analyzed site
    analysis_date: str                                # date of the analysis (presumably ISO-8601 — confirm with producer)
    overall_score: float                              # aggregate GEO score across pages
    pages_analyzed: int                               # number of pages included in the analysis
    geo_scores: Dict[str, float]                      # per-metric scores
    recommendations: List[str]                        # human-readable improvement tips
    optimization_opportunities: List[Dict[str, Any]]  # structured opportunity records
    competitive_position: str                         # qualitative competitive summary

    def to_dict(self) -> Dict[str, Any]:
        """Convert report to dictionary (recursive deep copy via asdict)."""
        return asdict(self)
33
+
34
+
35
@dataclass
class ContentAnalysis:
    """Data class for content optimization analysis.

    Plain record of a single content-enhancement run; ``to_dict`` yields a
    JSON-serializable mapping via ``dataclasses.asdict``.
    """
    original_content: str            # the content as submitted
    analysis_date: str               # date of the analysis (presumably ISO-8601 — confirm with producer)
    clarity_score: float             # clarity metric
    structure_score: float           # structure metric
    answerability_score: float       # answerability metric
    keywords: List[str]              # keywords identified in the content
    optimized_content: Optional[str] # rewritten content, or None if analyze-only
    improvements_made: List[str]     # descriptions of applied improvements

    def to_dict(self) -> Dict[str, Any]:
        """Convert analysis to dictionary (recursive deep copy via asdict)."""
        return asdict(self)
51
+
52
+ class ResultExporter:
53
+ """Main class for exporting analysis results and generating reports"""
54
+
55
+ def __init__(self):
56
+ self.export_formats = ['json', 'csv', 'html', 'pdf', 'xlsx']
57
+ self.supported_types = ['geo_analysis', 'content_optimization', 'qa_results', 'batch_analysis']
58
+
59
+ def export_geo_results(self, geo_results: List[Dict[str, Any]],
60
+ website_url: str, format_type: str = 'json') -> Union[str, bytes, Dict[str, Any]]:
61
+ """
62
+ Export GEO analysis results in specified format
63
+
64
+ Args:
65
+ geo_results (List[Dict]): List of GEO analysis results
66
+ website_url (str): URL of analyzed website
67
+ format_type (str): Export format ('json', 'csv', 'html', 'xlsx')
68
+
69
+ Returns:
70
+ Union[str, bytes, Dict]: Exported data in requested format
71
+ """
72
+ try:
73
+ # Prepare consolidated data
74
+ export_data = self._prepare_geo_export_data(geo_results, website_url)
75
+
76
+ if format_type.lower() == 'json':
77
+ return self._export_geo_json(export_data)
78
+ elif format_type.lower() == 'csv':
79
+ return self._export_geo_csv(export_data)
80
+ elif format_type.lower() == 'html':
81
+ return self._export_geo_html(export_data)
82
+ elif format_type.lower() == 'xlsx':
83
+ return self._export_geo_excel(export_data)
84
+ elif format_type.lower() == 'pdf':
85
+ return self._export_geo_pdf(export_data)
86
+ else:
87
+ raise ValueError(f"Unsupported export format: {format_type}")
88
+
89
+ except Exception as e:
90
+ return {'error': f"Export failed: {str(e)}"}
91
+
92
+ def export_enhancement_results(self, enhancement_result: Dict[str, Any],
93
+ format_type: str = 'json') -> Union[str, bytes, Dict[str, Any]]:
94
+ """
95
+ Export content enhancement results
96
+
97
+ Args:
98
+ enhancement_result (Dict): Content enhancement analysis result
99
+ format_type (str): Export format
100
+
101
+ Returns:
102
+ Union[str, bytes, Dict]: Exported data
103
+ """
104
+ try:
105
+ # Prepare data for export
106
+ export_data = self._prepare_enhancement_export_data(enhancement_result)
107
+
108
+ if format_type.lower() == 'json':
109
+ return json.dumps(export_data, indent=2, ensure_ascii=False)
110
+ elif format_type.lower() == 'html':
111
+ return self._export_enhancement_html(export_data)
112
+ elif format_type.lower() == 'csv':
113
+ return self._export_enhancement_csv(export_data)
114
+ else:
115
+ return json.dumps(export_data, indent=2, ensure_ascii=False)
116
+
117
+ except Exception as e:
118
+ return {'error': f"Enhancement export failed: {str(e)}"}
119
+
120
+ def export_qa_results(self, qa_results: List[Dict[str, Any]],
121
+ format_type: str = 'json') -> Union[str, bytes, Dict[str, Any]]:
122
+ """
123
+ Export Q&A session results
124
+
125
+ Args:
126
+ qa_results (List[Dict]): List of Q&A interactions
127
+ format_type (str): Export format
128
+
129
+ Returns:
130
+ Union[str, bytes, Dict]: Exported data
131
+ """
132
+ try:
133
+ export_data = {
134
+ 'qa_session': {
135
+ 'session_date': datetime.now().isoformat(),
136
+ 'total_questions': len(qa_results),
137
+ 'interactions': qa_results
138
+ },
139
+ 'summary': {
140
+ 'successful_answers': len([r for r in qa_results if not r.get('error')]),
141
+ 'average_response_length': self._calculate_avg_response_length(qa_results),
142
+ 'most_common_topics': self._extract_common_topics(qa_results)
143
+ }
144
+ }
145
+
146
+ if format_type.lower() == 'json':
147
+ return json.dumps(export_data, indent=2, ensure_ascii=False)
148
+ elif format_type.lower() == 'html':
149
+ return self._export_qa_html(export_data)
150
+ elif format_type.lower() == 'csv':
151
+ return self._export_qa_csv(export_data)
152
+ else:
153
+ return json.dumps(export_data, indent=2, ensure_ascii=False)
154
+
155
+ except Exception as e:
156
+ return {'error': f"Q&A export failed: {str(e)}"}
157
+
158
+ def create_comprehensive_report(self, analysis_data: Dict[str, Any],
159
+ report_type: str = 'full') -> Dict[str, Any]:
160
+ """
161
+ Create comprehensive analysis report
162
+
163
+ Args:
164
+ analysis_data (Dict): Combined analysis data from multiple sources
165
+ report_type (str): Type of report ('full', 'summary', 'executive')
166
+
167
+ Returns:
168
+ Dict: Comprehensive report data
169
+ """
170
+ try:
171
+ report = {
172
+ 'report_metadata': {
173
+ 'generated_at': datetime.now().isoformat(),
174
+ 'report_type': report_type,
175
+ 'generator': 'GEO SEO AI Optimizer',
176
+ 'version': '1.0'
177
+ }
178
+ }
179
+
180
+ if report_type == 'executive':
181
+ report.update(self._create_executive_summary(analysis_data))
182
+ elif report_type == 'summary':
183
+ report.update(self._create_summary_report(analysis_data))
184
+ else: # full report
185
+ report.update(self._create_full_report(analysis_data))
186
+
187
+ return report
188
+
189
+ except Exception as e:
190
+ return {'error': f"Report creation failed: {str(e)}"}
191
+
192
+ def export_batch_results(self, batch_results: List[Dict[str, Any]],
193
+ batch_metadata: Dict[str, Any],
194
+ format_type: str = 'xlsx') -> Union[str, bytes, Dict[str, Any]]:
195
+ """
196
+ Export batch analysis results
197
+
198
+ Args:
199
+ batch_results (List[Dict]): List of batch analysis results
200
+ batch_metadata (Dict): Metadata about the batch process
201
+ format_type (str): Export format
202
+
203
+ Returns:
204
+ Union[str, bytes, Dict]: Exported batch data
205
+ """
206
+ try:
207
+ export_data = {
208
+ 'batch_metadata': batch_metadata,
209
+ 'batch_results': batch_results,
210
+ 'batch_summary': self._create_batch_summary(batch_results),
211
+ 'export_timestamp': datetime.now().isoformat()
212
+ }
213
+
214
+ if format_type.lower() == 'xlsx':
215
+ return self._export_batch_excel(export_data)
216
+ elif format_type.lower() == 'json':
217
+ return json.dumps(export_data, indent=2, ensure_ascii=False)
218
+ elif format_type.lower() == 'csv':
219
+ return self._export_batch_csv(export_data)
220
+ else:
221
+ return json.dumps(export_data, indent=2, ensure_ascii=False)
222
+
223
+ except Exception as e:
224
+ return {'error': f"Batch export failed: {str(e)}"}
225
+
226
+ def create_export_package(self, analysis_data: Dict[str, Any],
227
+ package_name: str = "geo_analysis") -> bytes:
228
+ """
229
+ Create a ZIP package with multiple export formats
230
+
231
+ Args:
232
+ analysis_data (Dict): Analysis data to package
233
+ package_name (str): Name for the package
234
+
235
+ Returns:
236
+ bytes: ZIP file content
237
+ """
238
+ try:
239
+ # Create temporary directory
240
+ with tempfile.TemporaryDirectory() as temp_dir:
241
+ zip_path = os.path.join(temp_dir, f"{package_name}.zip")
242
+
243
+ with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zip_file:
244
+ # Add JSON export
245
+ json_data = json.dumps(analysis_data, indent=2, ensure_ascii=False)
246
+ zip_file.writestr(f"{package_name}.json", json_data)
247
+
248
+ # Add HTML report
249
+ if 'geo_results' in analysis_data:
250
+ html_data = self._export_geo_html(analysis_data)
251
+ zip_file.writestr(f"{package_name}_report.html", html_data)
252
+
253
+ # Add CSV data
254
+ if 'geo_results' in analysis_data:
255
+ csv_data = self._export_geo_csv(analysis_data)
256
+ zip_file.writestr(f"{package_name}_data.csv", csv_data)
257
+
258
+ # Add README
259
+ readme_content = self._generate_package_readme(analysis_data)
260
+ zip_file.writestr("README.txt", readme_content)
261
+
262
+ # Read the ZIP file
263
+ with open(zip_path, 'rb') as zip_file:
264
+ return zip_file.read()
265
+
266
+ except Exception as e:
267
+ raise Exception(f"Package creation failed: {str(e)}")
268
+
269
+ def _prepare_geo_export_data(self, geo_results: List[Dict[str, Any]], website_url: str) -> Dict[str, Any]:
270
+ """Prepare GEO data for export"""
271
+ try:
272
+ # Calculate aggregate metrics
273
+ valid_results = [r for r in geo_results if 'geo_scores' in r and not r.get('error')]
274
+
275
+ if not valid_results:
276
+ return {
277
+ 'error': 'No valid GEO results to export',
278
+ 'website_url': website_url,
279
+ 'export_timestamp': datetime.now().isoformat()
280
+ }
281
+
282
+ # Aggregate scores
283
+ all_scores = {}
284
+ for result in valid_results:
285
+ for metric, score in result.get('geo_scores', {}).items():
286
+ if metric not in all_scores:
287
+ all_scores[metric] = []
288
+ all_scores[metric].append(score)
289
+
290
+ avg_scores = {metric: sum(scores) / len(scores) for metric, scores in all_scores.items()}
291
+ overall_avg = sum(avg_scores.values()) / len(avg_scores) if avg_scores else 0
292
+
293
+ # Collect recommendations
294
+ all_recommendations = []
295
+ all_opportunities = []
296
+
297
+ for result in valid_results:
298
+ all_recommendations.extend(result.get('recommendations', []))
299
+ all_opportunities.extend(result.get('optimization_opportunities', []))
300
+
301
+ # Remove duplicates
302
+ unique_recommendations = list(set(all_recommendations))
303
+
304
+ return {
305
+ 'website_analysis': {
306
+ 'url': website_url,
307
+ 'analysis_date': datetime.now().isoformat(),
308
+ 'pages_analyzed': len(valid_results),
309
+ 'overall_geo_score': round(overall_avg, 2)
310
+ },
311
+ 'aggregate_scores': avg_scores,
312
+ 'individual_page_results': valid_results,
313
+ 'recommendations': unique_recommendations[:10], # Top 10
314
+ 'optimization_opportunities': all_opportunities,
315
+ 'performance_insights': self._generate_performance_insights(avg_scores, overall_avg),
316
+ 'export_metadata': {
317
+ 'exported_by': 'GEO SEO AI Optimizer',
318
+ 'export_timestamp': datetime.now().isoformat(),
319
+ 'data_format': 'GEO Analysis Results v1.0'
320
+ }
321
+ }
322
+
323
+ except Exception as e:
324
+ return {'error': f"Data preparation failed: {str(e)}"}
325
+
326
+ def _prepare_enhancement_export_data(self, enhancement_result: Dict[str, Any]) -> Dict[str, Any]:
327
+ """Prepare content enhancement data for export"""
328
+ try:
329
+ scores = enhancement_result.get('scores', {})
330
+
331
+ return {
332
+ 'content_analysis': {
333
+ 'analysis_date': datetime.now().isoformat(),
334
+ 'original_content_length': enhancement_result.get('original_length', 0),
335
+ 'original_word_count': enhancement_result.get('original_word_count', 0),
336
+ 'analysis_type': enhancement_result.get('optimization_type', 'standard')
337
+ },
338
+ 'performance_scores': {
339
+ 'clarity': scores.get('clarity', 0),
340
+ 'structure': scores.get('structuredness', 0),
341
+ 'answerability': scores.get('answerability', 0),
342
+ 'overall_average': sum(scores.values()) / len(scores) if scores else 0
343
+ },
344
+ 'optimization_results': {
345
+ 'keywords_identified': enhancement_result.get('keywords', []),
346
+ 'optimized_content': enhancement_result.get('optimized_text', ''),
347
+ 'improvements_made': enhancement_result.get('optimization_suggestions', []),
348
+ 'analyze_only': enhancement_result.get('analyze_only', False)
349
+ },
350
+ 'export_metadata': {
351
+ 'exported_by': 'GEO SEO AI Optimizer',
352
+ 'export_timestamp': datetime.now().isoformat(),
353
+ 'data_format': 'Content Enhancement Results v1.0'
354
+ }
355
+ }
356
+
357
+ except Exception as e:
358
+ return {'error': f"Enhancement data preparation failed: {str(e)}"}
359
+
360
+ def _export_geo_json(self, data: Dict[str, Any]) -> str:
361
+ """Export GEO data as JSON"""
362
+ return json.dumps(data, indent=2, ensure_ascii=False)
363
+
364
+ def _export_geo_csv(self, data: Dict[str, Any]) -> str:
365
+ """Export GEO data as CSV"""
366
+ try:
367
+ output = io.StringIO()
368
+
369
+ # Write aggregate scores
370
+ writer = csv.writer(output)
371
+ writer.writerow(['GEO Analysis Results'])
372
+ writer.writerow(['Website:', data.get('website_analysis', {}).get('url', 'Unknown')])
373
+ writer.writerow(['Analysis Date:', data.get('website_analysis', {}).get('analysis_date', 'Unknown')])
374
+ writer.writerow(['Overall Score:', data.get('website_analysis', {}).get('overall_geo_score', 0)])
375
+ writer.writerow([])
376
+
377
+ # Write aggregate scores
378
+ writer.writerow(['Metric', 'Score'])
379
+ for metric, score in data.get('aggregate_scores', {}).items():
380
+ writer.writerow([metric.replace('_', ' ').title(), round(score, 2)])
381
+
382
+ writer.writerow([])
383
+ writer.writerow(['Recommendations'])
384
+ for i, rec in enumerate(data.get('recommendations', []), 1):
385
+ writer.writerow([f"{i}.", rec])
386
+
387
+ # Individual page results
388
+ if data.get('individual_page_results'):
389
+ writer.writerow([])
390
+ writer.writerow(['Individual Page Results'])
391
+
392
+ # Header for page results
393
+ first_result = data['individual_page_results'][0]
394
+ if 'geo_scores' in first_result:
395
+ headers = ['Page Index', 'Page URL', 'Page Title'] + list(first_result['geo_scores'].keys())
396
+ writer.writerow(headers)
397
+
398
+ for i, result in enumerate(data['individual_page_results']):
399
+ page_data = result.get('page_data', {})
400
+ scores = result.get('geo_scores', {})
401
+
402
+ row = [
403
+ i + 1,
404
+ page_data.get('url', 'Unknown'),
405
+ page_data.get('title', 'Unknown')
406
+ ] + [round(scores.get(metric, 0), 2) for metric in headers[3:]]
407
+
408
+ writer.writerow(row)
409
+
410
+ return output.getvalue()
411
+
412
+ except Exception as e:
413
+ return f"CSV export error: {str(e)}"
414
+
415
    def _export_geo_html(self, data: Dict[str, Any]) -> str:
        """Export GEO data as a standalone, styled HTML report.

        Reads the 'website_analysis', 'aggregate_scores' and
        'recommendations' sections of the prepared payload (all optional)
        and returns a complete HTML document. On any failure a minimal
        error page is returned instead of raising.
        """
        try:
            website_info = data.get('website_analysis', {})
            scores = data.get('aggregate_scores', {})
            recommendations = data.get('recommendations', [])

            # NOTE: inside this f-string all literal CSS braces are doubled
            # ({{ }}) so that only the {...} placeholders interpolate.
            html_content = f"""
            <!DOCTYPE html>
            <html lang="en">
            <head>
                <meta charset="UTF-8">
                <meta name="viewport" content="width=device-width, initial-scale=1.0">
                <title>GEO Analysis Report - {website_info.get('url', 'Website')}</title>
                <style>
                    body {{
                        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
                        line-height: 1.6;
                        color: #333;
                        max-width: 1200px;
                        margin: 0 auto;
                        padding: 20px;
                        background-color: #f5f5f5;
                    }}
                    .header {{
                        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                        color: white;
                        padding: 30px;
                        border-radius: 10px;
                        margin-bottom: 30px;
                        text-align: center;
                    }}
                    .header h1 {{
                        margin: 0;
                        font-size: 2.5em;
                    }}
                    .summary-cards {{
                        display: grid;
                        grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
                        gap: 20px;
                        margin-bottom: 30px;
                    }}
                    .card {{
                        background: white;
                        padding: 20px;
                        border-radius: 10px;
                        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
                        text-align: center;
                    }}
                    .card h3 {{
                        margin-top: 0;
                        color: #667eea;
                    }}
                    .score {{
                        font-size: 2em;
                        font-weight: bold;
                        color: #333;
                    }}
                    .scores-grid {{
                        display: grid;
                        grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
                        gap: 20px;
                        margin-bottom: 30px;
                    }}
                    .score-item {{
                        background: white;
                        padding: 15px;
                        border-radius: 8px;
                        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
                        display: flex;
                        justify-content: space-between;
                        align-items: center;
                    }}
                    .score-bar {{
                        width: 100px;
                        height: 10px;
                        background: #e0e0e0;
                        border-radius: 5px;
                        overflow: hidden;
                    }}
                    .score-fill {{
                        height: 100%;
                        background: linear-gradient(90deg, #ff6b6b, #ffa500, #4ecdc4);
                        transition: width 0.3s ease;
                    }}
                    .recommendations {{
                        background: white;
                        padding: 30px;
                        border-radius: 10px;
                        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
                        margin-bottom: 30px;
                    }}
                    .recommendations h2 {{
                        color: #667eea;
                        border-bottom: 2px solid #667eea;
                        padding-bottom: 10px;
                    }}
                    .rec-item {{
                        padding: 10px 0;
                        border-bottom: 1px solid #eee;
                    }}
                    .footer {{
                        text-align: center;
                        color: #666;
                        margin-top: 40px;
                        padding-top: 20px;
                        border-top: 1px solid #ddd;
                    }}
                </style>
            </head>
            <body>
                <div class="header">
                    <h1>🚀 GEO Analysis Report</h1>
                    <p>Generative Engine Optimization Performance Analysis</p>
                    <p><strong>Website:</strong> {website_info.get('url', 'Not specified')}</p>
                    <p><strong>Analysis Date:</strong> {website_info.get('analysis_date', 'Not specified')}</p>
                </div>

                <div class="summary-cards">
                    <div class="card">
                        <h3>Overall GEO Score</h3>
                        <div class="score">{website_info.get('overall_geo_score', 0)}/10</div>
                    </div>
                    <div class="card">
                        <h3>Pages Analyzed</h3>
                        <div class="score">{website_info.get('pages_analyzed', 0)}</div>
                    </div>
                    <div class="card">
                        <h3>Recommendations</h3>
                        <div class="score">{len(recommendations)}</div>
                    </div>
                </div>

                <h2>📊 Detailed GEO Metrics</h2>
                <div class="scores-grid">
            """

            # Add individual scores: one flex row with a proportional bar
            # per aggregate metric.
            for metric, score in scores.items():
                metric_display = metric.replace('_', ' ').title()
                # Scores appear to be on a 0-10 scale (rendered as "x/10")
                # — TODO confirm; the cap at 100 keeps the bar inside its
                # track even for out-of-range values.
                score_percentage = min(score * 10, 100)  # Convert to percentage

                html_content += f"""
                <div class="score-item">
                    <div>
                        <strong>{metric_display}</strong><br>
                        <span style="color: #666;">{score:.1f}/10</span>
                    </div>
                    <div class="score-bar">
                        <div class="score-fill" style="width: {score_percentage}%;"></div>
                    </div>
                </div>
                """

            html_content += """
            </div>

            <div class="recommendations">
                <h2>💡 Optimization Recommendations</h2>
            """

            # Add recommendations as a numbered list of styled rows.
            for i, rec in enumerate(recommendations, 1):
                html_content += f'<div class="rec-item"><strong>{i}.</strong> {rec}</div>'

            html_content += f"""
            </div>

            <div class="footer">
                <p>Generated by GEO SEO AI Optimizer | {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
                <p>This report provides AI-first SEO optimization insights for better generative engine performance.</p>
            </div>
            </body>
            </html>
            """

            return html_content

        except Exception as e:
            # Degrade to a minimal error page rather than raising.
            return f"<html><body><h1>HTML Export Error</h1><p>{str(e)}</p></body></html>"
    def _export_geo_excel(self, data: Dict[str, Any]) -> bytes:
        """Export GEO data as an Excel workbook, returned as raw bytes.

        Writes up to four sheets via pandas/openpyxl: 'Summary',
        'GEO Scores', 'Recommendations' (only when non-empty) and
        'Individual Pages' (only when page results exist). If workbook
        creation fails for any reason, the error text plus a JSON dump of
        the payload is returned as UTF-8 bytes instead.
        """
        try:
            output = io.BytesIO()

            with pd.ExcelWriter(output, engine='openpyxl') as writer:
                # Summary sheet: fixed key/value rows for the website-level info.
                summary_data = {
                    'Metric': ['Website URL', 'Analysis Date', 'Pages Analyzed', 'Overall Score'],
                    'Value': [
                        data.get('website_analysis', {}).get('url', 'Unknown'),
                        data.get('website_analysis', {}).get('analysis_date', 'Unknown'),
                        data.get('website_analysis', {}).get('pages_analyzed', 0),
                        data.get('website_analysis', {}).get('overall_geo_score', 0)
                    ]
                }
                pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary', index=False)

                # Scores sheet: aggregate metric scores plus a qualitative label.
                scores_data = []
                for metric, score in data.get('aggregate_scores', {}).items():
                    scores_data.append({
                        'Metric': metric.replace('_', ' ').title(),
                        'Score': round(score, 2),
                        'Performance': self._get_performance_level(score)
                    })

                pd.DataFrame(scores_data).to_excel(writer, sheet_name='GEO Scores', index=False)

                # Recommendations sheet: priority = original list order.
                rec_data = []
                for i, rec in enumerate(data.get('recommendations', []), 1):
                    rec_data.append({
                        'Priority': i,
                        'Recommendation': rec,
                        'Category': self._categorize_recommendation(rec)
                    })

                if rec_data:
                    pd.DataFrame(rec_data).to_excel(writer, sheet_name='Recommendations', index=False)

                # Individual pages sheet: one row per page, with each GEO
                # metric expanded into its own column.
                if data.get('individual_page_results'):
                    pages_data = []
                    for i, result in enumerate(data['individual_page_results']):
                        page_data = result.get('page_data', {})
                        scores = result.get('geo_scores', {})

                        page_row = {
                            'Page_Index': i + 1,
                            'URL': page_data.get('url', 'Unknown'),
                            'Title': page_data.get('title', 'Unknown'),
                            'Word_Count': page_data.get('word_count', 0)
                        }

                        # Add all GEO scores
                        for metric, score in scores.items():
                            page_row[metric.replace('_', ' ').title()] = round(score, 2)

                        pages_data.append(page_row)

                    pd.DataFrame(pages_data).to_excel(writer, sheet_name='Individual Pages', index=False)

            output.seek(0)
            return output.getvalue()

        except Exception as e:
            # Return error as text file if Excel creation fails
            error_content = f"Excel export failed: {str(e)}\n\nData:\n{json.dumps(data, indent=2)}"
            return error_content.encode('utf-8')
    def _export_enhancement_html(self, data: Dict[str, Any]) -> str:
        """Export content enhancement results as a standalone HTML report.

        Reads the 'content_analysis', 'performance_scores' and
        'optimization_results' sections of the prepared payload. The
        optimized-content section is only rendered when optimized text
        exists and the run was not analyze-only. Returns a minimal error
        page on failure.
        """
        try:
            analysis = data.get('content_analysis', {})
            scores = data.get('performance_scores', {})
            optimization = data.get('optimization_results', {})

            # NOTE: literal CSS braces are doubled ({{ }}) inside this
            # f-string so only the {...} placeholders interpolate.
            html_content = f"""
            <!DOCTYPE html>
            <html lang="en">
            <head>
                <meta charset="UTF-8">
                <meta name="viewport" content="width=device-width, initial-scale=1.0">
                <title>Content Enhancement Report</title>
                <style>
                    body {{
                        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
                        line-height: 1.6;
                        color: #333;
                        max-width: 1000px;
                        margin: 0 auto;
                        padding: 20px;
                        background-color: #f8f9fa;
                    }}
                    .header {{
                        background: linear-gradient(135deg, #28a745 0%, #20c997 100%);
                        color: white;
                        padding: 30px;
                        border-radius: 10px;
                        margin-bottom: 30px;
                        text-align: center;
                    }}
                    .scores {{
                        display: grid;
                        grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
                        gap: 20px;
                        margin-bottom: 30px;
                    }}
                    .score-card {{
                        background: white;
                        padding: 20px;
                        border-radius: 10px;
                        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
                        text-align: center;
                    }}
                    .content-section {{
                        background: white;
                        padding: 30px;
                        border-radius: 10px;
                        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
                        margin-bottom: 20px;
                    }}
                    .keywords {{
                        display: flex;
                        flex-wrap: wrap;
                        gap: 10px;
                        margin-top: 15px;
                    }}
                    .keyword {{
                        background: #e9ecef;
                        padding: 5px 10px;
                        border-radius: 20px;
                        font-size: 0.9em;
                    }}
                    .optimized-content {{
                        background: #f8f9fa;
                        padding: 20px;
                        border-left: 4px solid #28a745;
                        border-radius: 5px;
                        font-style: italic;
                    }}
                </style>
            </head>
            <body>
                <div class="header">
                    <h1>🔧 Content Enhancement Report</h1>
                    <p>AI-Optimized Content Analysis Results</p>
                    <p><strong>Analysis Date:</strong> {analysis.get('analysis_date', 'Unknown')}</p>
                </div>

                <div class="scores">
                    <div class="score-card">
                        <h3>Clarity Score</h3>
                        <div style="font-size: 2em; font-weight: bold; color: #28a745;">
                            {scores.get('clarity', 0):.1f}/10
                        </div>
                    </div>
                    <div class="score-card">
                        <h3>Structure Score</h3>
                        <div style="font-size: 2em; font-weight: bold; color: #28a745;">
                            {scores.get('structure', 0):.1f}/10
                        </div>
                    </div>
                    <div class="score-card">
                        <h3>Answerability Score</h3>
                        <div style="font-size: 2em; font-weight: bold; color: #28a745;">
                            {scores.get('answerability', 0):.1f}/10
                        </div>
                    </div>
                    <div class="score-card">
                        <h3>Overall Average</h3>
                        <div style="font-size: 2em; font-weight: bold; color: #28a745;">
                            {scores.get('overall_average', 0):.1f}/10
                        </div>
                    </div>
                </div>

                <div class="content-section">
                    <h2>🔑 Identified Keywords</h2>
                    <div class="keywords">
                        {' '.join([f'<span class="keyword">{keyword}</span>' for keyword in optimization.get('keywords_identified', [])])}
                    </div>
                </div>

                {'<div class="content-section"><h2>✨ Optimized Content</h2><div class="optimized-content">' + optimization.get('optimized_content', '') + '</div></div>' if optimization.get('optimized_content') and not optimization.get('analyze_only') else ''}

                <div class="content-section">
                    <h2>💡 Improvements Made</h2>
                    <ul>
                        {' '.join([f'<li>{improvement}</li>' for improvement in optimization.get('improvements_made', [])])}
                    </ul>
                </div>

                <div style="text-align: center; color: #666; margin-top: 40px; padding-top: 20px; border-top: 1px solid #ddd;">
                    <p>Generated by GEO SEO AI Optimizer | {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
                </div>
            </body>
            </html>
            """

            return html_content

        except Exception as e:
            # Degrade to a minimal error page rather than raising.
            return f"<html><body><h1>Enhancement HTML Export Error</h1><p>{str(e)}</p></body></html>"
+ def _export_enhancement_csv(self, data: Dict[str, Any]) -> str:
803
+ """Export content enhancement results as CSV"""
804
+ try:
805
+ output = io.StringIO()
806
+ writer = csv.writer(output)
807
+
808
+ # Header information
809
+ analysis = data.get('content_analysis', {})
810
+ scores = data.get('performance_scores', {})
811
+ optimization = data.get('optimization_results', {})
812
+
813
+ writer.writerow(['Content Enhancement Analysis Report'])
814
+ writer.writerow(['Analysis Date:', analysis.get('analysis_date', 'Unknown')])
815
+ writer.writerow(['Original Content Length:', analysis.get('original_content_length', 0)])
816
+ writer.writerow(['Original Word Count:', analysis.get('original_word_count', 0)])
817
+ writer.writerow([])
818
+
819
+ # Performance scores
820
+ writer.writerow(['Performance Scores'])
821
+ writer.writerow(['Metric', 'Score'])
822
+ for metric, score in scores.items():
823
+ writer.writerow([metric.replace('_', ' ').title(), round(score, 2)])
824
+
825
+ writer.writerow([])
826
+ writer.writerow(['Keywords Identified'])
827
+ for keyword in optimization.get('keywords_identified', []):
828
+ writer.writerow([keyword])
829
+
830
+ writer.writerow([])
831
+ writer.writerow(['Improvements Made'])
832
+ for improvement in optimization.get('improvements_made', []):
833
+ writer.writerow([improvement])
834
+
835
+ return output.getvalue()
836
+
837
+ except Exception as e:
838
+ return f"Enhancement CSV export error: {str(e)}"
839
+
840
    def _export_qa_html(self, data: Dict[str, Any]) -> str:
        """Export Q&A session results as a standalone HTML report.

        Reads 'qa_session' (with its 'interactions' list) and 'summary'
        from the prepared payload. Each interaction renders as a
        question/answer pair with up to three source previews. Returns a
        minimal error page on failure.
        """
        try:
            session = data.get('qa_session', {})
            summary = data.get('summary', {})
            interactions = session.get('interactions', [])

            # NOTE: literal CSS braces are doubled ({{ }}) inside this
            # f-string so only the {...} placeholders interpolate.
            html_content = f"""
            <!DOCTYPE html>
            <html lang="en">
            <head>
                <meta charset="UTF-8">
                <meta name="viewport" content="width=device-width, initial-scale=1.0">
                <title>Q&A Session Report</title>
                <style>
                    body {{
                        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
                        line-height: 1.6;
                        color: #333;
                        max-width: 1000px;
                        margin: 0 auto;
                        padding: 20px;
                        background-color: #f8f9fa;
                    }}
                    .header {{
                        background: linear-gradient(135deg, #6f42c1 0%, #e83e8c 100%);
                        color: white;
                        padding: 30px;
                        border-radius: 10px;
                        margin-bottom: 30px;
                        text-align: center;
                    }}
                    .summary {{
                        display: grid;
                        grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
                        gap: 20px;
                        margin-bottom: 30px;
                    }}
                    .summary-card {{
                        background: white;
                        padding: 20px;
                        border-radius: 10px;
                        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
                        text-align: center;
                    }}
                    .qa-item {{
                        background: white;
                        padding: 20px;
                        border-radius: 10px;
                        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
                        margin-bottom: 20px;
                    }}
                    .question {{
                        background: #e9ecef;
                        padding: 15px;
                        border-left: 4px solid #6f42c1;
                        border-radius: 5px;
                        margin-bottom: 15px;
                    }}
                    .answer {{
                        padding: 15px;
                        border-left: 4px solid #28a745;
                        border-radius: 5px;
                        background: #f8f9fa;
                    }}
                    .sources {{
                        margin-top: 15px;
                        padding: 10px;
                        background: #fff3cd;
                        border-radius: 5px;
                        font-size: 0.9em;
                    }}
                </style>
            </head>
            <body>
                <div class="header">
                    <h1>💬 Q&A Session Report</h1>
                    <p>Document Question & Answer Analysis</p>
                    <p><strong>Session Date:</strong> {session.get('session_date', 'Unknown')}</p>
                </div>

                <div class="summary">
                    <div class="summary-card">
                        <h3>Total Questions</h3>
                        <div style="font-size: 2em; font-weight: bold; color: #6f42c1;">
                            {session.get('total_questions', 0)}
                        </div>
                    </div>
                    <div class="summary-card">
                        <h3>Successful Answers</h3>
                        <div style="font-size: 2em; font-weight: bold; color: #28a745;">
                            {summary.get('successful_answers', 0)}
                        </div>
                    </div>
                    <div class="summary-card">
                        <h3>Avg Response Length</h3>
                        <div style="font-size: 2em; font-weight: bold; color: #17a2b8;">
                            {summary.get('average_response_length', 0):.0f}
                        </div>
                    </div>
                </div>

                <h2>📝 Q&A Interactions</h2>
            """

            # Add individual Q&A items. The answer is read from 'result'
            # first, falling back to 'answer' for older payload shapes.
            for i, interaction in enumerate(interactions, 1):
                question = interaction.get('query', 'No question')
                answer = interaction.get('result', interaction.get('answer', 'No answer'))
                sources = interaction.get('sources', [])

                html_content += f"""
                <div class="qa-item">
                    <h3>Question {i}</h3>
                    <div class="question">
                        <strong>Q:</strong> {question}
                    </div>
                    <div class="answer">
                        <strong>A:</strong> {answer}
                    </div>
                """

                if sources:
                    html_content += '<div class="sources"><strong>Sources:</strong><ul>'
                    for source in sources[:3]:  # Limit to first 3 sources
                        # Truncate long source excerpts to 200 chars.
                        content_preview = source.get('content', '')[:200] + '...' if len(source.get('content', '')) > 200 else source.get('content', '')
                        html_content += f'<li>{content_preview}</li>'
                    html_content += '</ul></div>'

                html_content += '</div>'

            html_content += f"""

            <div style="text-align: center; color: #666; margin-top: 40px; padding-top: 20px; border-top: 1px solid #ddd;">
                <p>Generated by GEO SEO AI Optimizer | {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
            </div>
            </body>
            </html>
            """

            return html_content

        except Exception as e:
            # Degrade to a minimal error page rather than raising.
            return f"<html><body><h1>Q&A HTML Export Error</h1><p>{str(e)}</p></body></html>"
+ def _export_qa_csv(self, data: Dict[str, Any]) -> str:
986
+ """Export Q&A results as CSV"""
987
+ try:
988
+ output = io.StringIO()
989
+ writer = csv.writer(output)
990
+
991
+ session = data.get('qa_session', {})
992
+ summary = data.get('summary', {})
993
+ interactions = session.get('interactions', [])
994
+
995
+ # Header
996
+ writer.writerow(['Q&A Session Report'])
997
+ writer.writerow(['Session Date:', session.get('session_date', 'Unknown')])
998
+ writer.writerow(['Total Questions:', session.get('total_questions', 0)])
999
+ writer.writerow(['Successful Answers:', summary.get('successful_answers', 0)])
1000
+ writer.writerow([])
1001
+
1002
+ # Q&A data
1003
+ writer.writerow(['Question Index', 'Question', 'Answer', 'Has Sources', 'Answer Length'])
1004
+
1005
+ for i, interaction in enumerate(interactions, 1):
1006
+ question = interaction.get('query', 'No question')
1007
+ answer = interaction.get('result', interaction.get('answer', 'No answer'))
1008
+ has_sources = 'Yes' if interaction.get('sources') else 'No'
1009
+ answer_length = len(answer) if answer else 0
1010
+
1011
+ writer.writerow([i, question, answer, has_sources, answer_length])
1012
+
1013
+ return output.getvalue()
1014
+
1015
+ except Exception as e:
1016
+ return f"Q&A CSV export error: {str(e)}"
1017
+
1018
    def _export_batch_excel(self, data: Dict[str, Any]) -> bytes:
        """Export batch results as an Excel workbook, returned as raw bytes.

        Sheets: 'Batch Metadata' and 'Batch Summary' (key/value dumps) and,
        when any results exist, 'Batch Results' with each nested result
        flattened into a single row. Falls back to UTF-8 error text plus a
        JSON dump of the payload when workbook creation fails.
        """
        try:
            output = io.BytesIO()

            with pd.ExcelWriter(output, engine='openpyxl') as writer:
                # Batch metadata sheet
                metadata = data.get('batch_metadata', {})
                metadata_df = pd.DataFrame([
                    {'Property': k, 'Value': v} for k, v in metadata.items()
                ])
                metadata_df.to_excel(writer, sheet_name='Batch Metadata', index=False)

                # Batch summary sheet
                summary = data.get('batch_summary', {})
                summary_df = pd.DataFrame([
                    {'Metric': k, 'Value': v} for k, v in summary.items()
                ])
                summary_df.to_excel(writer, sheet_name='Batch Summary', index=False)

                # Individual results sheet
                results = data.get('batch_results', [])
                if results:
                    # Flatten results for tabular format: nested dicts become
                    # underscore-joined columns (see _flatten_dict).
                    flattened_results = []
                    for i, result in enumerate(results):
                        flat_result = {'Batch_Index': i}
                        self._flatten_dict(result, flat_result)  # mutates flat_result in place
                        flattened_results.append(flat_result)

                    results_df = pd.DataFrame(flattened_results)
                    results_df.to_excel(writer, sheet_name='Batch Results', index=False)

            output.seek(0)
            return output.getvalue()

        except Exception as e:
            error_content = f"Batch Excel export failed: {str(e)}\n\nData:\n{json.dumps(data, indent=2)}"
            return error_content.encode('utf-8')
+ def _export_batch_csv(self, data: Dict[str, Any]) -> str:
1059
+ """Export batch results as CSV"""
1060
+ try:
1061
+ output = io.StringIO()
1062
+ writer = csv.writer(output)
1063
+
1064
+ # Batch metadata
1065
+ metadata = data.get('batch_metadata', {})
1066
+ writer.writerow(['Batch Analysis Results'])
1067
+ writer.writerow(['Export Timestamp:', data.get('export_timestamp', 'Unknown')])
1068
+ writer.writerow([])
1069
+
1070
+ writer.writerow(['Batch Metadata'])
1071
+ for key, value in metadata.items():
1072
+ writer.writerow([key, value])
1073
+
1074
+ writer.writerow([])
1075
+
1076
+ # Batch summary
1077
+ summary = data.get('batch_summary', {})
1078
+ writer.writerow(['Batch Summary'])
1079
+ for key, value in summary.items():
1080
+ writer.writerow([key, value])
1081
+
1082
+ writer.writerow([])
1083
+
1084
+ # Individual results (simplified)
1085
+ results = data.get('batch_results', [])
1086
+ if results:
1087
+ writer.writerow(['Individual Results'])
1088
+ writer.writerow(['Index', 'Status', 'Summary'])
1089
+
1090
+ for i, result in enumerate(results):
1091
+ status = 'Success' if not result.get('error') else 'Error'
1092
+ summary_text = str(result)[:100] + '...' if len(str(result)) > 100 else str(result)
1093
+ writer.writerow([i, status, summary_text])
1094
+
1095
+ return output.getvalue()
1096
+
1097
+ except Exception as e:
1098
+ return f"Batch CSV export error: {str(e)}"
1099
+
1100
+ def _export_geo_pdf(self, data: Dict[str, Any]) -> bytes:
1101
+ """Export GEO data as PDF (placeholder - would need reportlab)"""
1102
+ try:
1103
+ # For now, return HTML content as bytes
1104
+ # In a full implementation, you'd use reportlab or weasyprint
1105
+ html_content = self._export_geo_html(data)
1106
+ return html_content.encode('utf-8')
1107
+
1108
+ except Exception as e:
1109
+ error_content = f"PDF export not fully implemented. Error: {str(e)}"
1110
+ return error_content.encode('utf-8')
1111
+
1112
+ def _create_executive_summary(self, analysis_data: Dict[str, Any]) -> Dict[str, Any]:
1113
+ """Create executive summary report"""
1114
+ try:
1115
+ geo_results = analysis_data.get('geo_results', [])
1116
+ enhancement_results = analysis_data.get('enhancement_results', {})
1117
+ qa_results = analysis_data.get('qa_results', [])
1118
+
1119
+ # Calculate key metrics
1120
+ overall_performance = self._calculate_overall_performance(analysis_data)
1121
+
1122
+ return {
1123
+ 'executive_summary': {
1124
+ 'overall_performance_score': overall_performance,
1125
+ 'key_findings': self._extract_key_findings(analysis_data),
1126
+ 'priority_recommendations': self._get_priority_recommendations(analysis_data),
1127
+ 'roi_potential': self._estimate_roi_potential(overall_performance),
1128
+ 'implementation_timeline': self._suggest_implementation_timeline(analysis_data),
1129
+ 'resource_requirements': self._estimate_resource_requirements(analysis_data)
1130
+ }
1131
+ }
1132
+
1133
+ except Exception as e:
1134
+ return {'error': f"Executive summary creation failed: {str(e)}"}
1135
+
1136
+ def _create_summary_report(self, analysis_data: Dict[str, Any]) -> Dict[str, Any]:
1137
+ """Create summary report"""
1138
+ try:
1139
+ return {
1140
+ 'summary_report': {
1141
+ 'analysis_overview': self._create_analysis_overview(analysis_data),
1142
+ 'performance_metrics': self._summarize_performance_metrics(analysis_data),
1143
+ 'improvement_opportunities': self._identify_improvement_opportunities(analysis_data),
1144
+ 'competitive_position': self._assess_competitive_position(analysis_data),
1145
+ 'next_steps': self._recommend_next_steps(analysis_data)
1146
+ }
1147
+ }
1148
+
1149
+ except Exception as e:
1150
+ return {'error': f"Summary report creation failed: {str(e)}"}
1151
+
1152
+ def _create_full_report(self, analysis_data: Dict[str, Any]) -> Dict[str, Any]:
1153
+ """Create full detailed report"""
1154
+ try:
1155
+ return {
1156
+ 'full_report': {
1157
+ 'executive_summary': self._create_executive_summary(analysis_data).get('executive_summary', {}),
1158
+ 'detailed_analysis': {
1159
+ 'geo_analysis_details': analysis_data.get('geo_results', []),
1160
+ 'content_optimization_details': analysis_data.get('enhancement_results', {}),
1161
+ 'qa_performance_details': analysis_data.get('qa_results', [])
1162
+ },
1163
+ 'methodology': self._document_methodology(),
1164
+ 'data_sources': self._document_data_sources(analysis_data),
1165
+ 'limitations': self._document_limitations(),
1166
+ 'appendices': self._create_appendices(analysis_data)
1167
+ }
1168
+ }
1169
+
1170
+ except Exception as e:
1171
+ return {'error': f"Full report creation failed: {str(e)}"}
1172
+
1173
+ def _create_batch_summary(self, batch_results: List[Dict[str, Any]]) -> Dict[str, Any]:
1174
+ """Create summary of batch processing results"""
1175
+ try:
1176
+ total_items = len(batch_results)
1177
+ successful_items = len([r for r in batch_results if not r.get('error')])
1178
+ failed_items = total_items - successful_items
1179
+
1180
+ return {
1181
+ 'total_items': total_items,
1182
+ 'successful_items': successful_items,
1183
+ 'failed_items': failed_items,
1184
+ 'success_rate': (successful_items / total_items * 100) if total_items > 0 else 0,
1185
+ 'processing_status': 'Completed',
1186
+ 'average_processing_time': self._calculate_avg_processing_time(batch_results),
1187
+ 'common_errors': self._identify_common_errors(batch_results)
1188
+ }
1189
+
1190
+ except Exception as e:
1191
+ return {'error': f"Batch summary creation failed: {str(e)}"}
1192
+
1193
+ def _generate_performance_insights(self, scores: Dict[str, float], overall_avg: float) -> List[str]:
1194
+ """Generate performance insights from scores"""
1195
+ insights = []
1196
+
1197
+ try:
1198
+ # Overall performance insight
1199
+ if overall_avg >= 8.0:
1200
+ insights.append("Excellent overall GEO performance - content is well-optimized for AI search engines")
1201
+ elif overall_avg >= 6.0:
1202
+ insights.append("Good GEO performance with room for improvement in specific areas")
1203
+ elif overall_avg >= 4.0:
1204
+ insights.append("Moderate GEO performance - significant optimization opportunities exist")
1205
+ else:
1206
+ insights.append("Low GEO performance - comprehensive optimization needed")
1207
+
1208
+ # Specific metric insights
1209
+ for metric, score in scores.items():
1210
+ if score < 5.0:
1211
+ metric_name = metric.replace('_', ' ').title()
1212
+ insights.append(f"Low {metric_name} score ({score:.1f}) needs immediate attention")
1213
+ elif score >= 8.5:
1214
+ metric_name = metric.replace('_', ' ').title()
1215
+ insights.append(f"Excellent {metric_name} score ({score:.1f}) - maintain current approach")
1216
+
1217
+ return insights[:5] # Return top 5 insights
1218
+
1219
+ except Exception:
1220
+ return ["Unable to generate performance insights"]
1221
+
1222
+ def _generate_package_readme(self, analysis_data: Dict[str, Any]) -> str:
1223
+ """Generate README file for export package"""
1224
+ try:
1225
+ readme_content = f"""
1226
+ GEO SEO AI Optimizer - Analysis Package
1227
+ ======================================
1228
+
1229
+ Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
1230
+
1231
+ This package contains the complete analysis results from the GEO SEO AI Optimizer tool.
1232
+
1233
+ Files Included:
1234
+ - JSON file: Complete raw data in JSON format
1235
+ - HTML file: Visual report for web viewing
1236
+ - CSV file: Tabular data for spreadsheet analysis
1237
+ - README.txt: This file
1238
+
1239
+ About GEO (Generative Engine Optimization):
1240
+ GEO is the practice of optimizing content for AI-powered search engines and
1241
+ language models. Unlike traditional SEO, GEO focuses on:
1242
+
1243
+ - AI search visibility
1244
+ - Query intent matching
1245
+ - Conversational readiness
1246
+ - Citation worthiness
1247
+ - Semantic richness
1248
+ - Context completeness
1249
+
1250
+ How to Use These Files:
1251
+ 1. Open the HTML file in a web browser for a visual report
1252
+ 2. Import the CSV file into Excel or Google Sheets for analysis
1253
+ 3. Use the JSON file for programmatic processing or integration
1254
+
1255
+ For more information about GEO optimization, visit the tool documentation.
1256
+
1257
+ Generated by: GEO SEO AI Optimizer v1.0
1258
+ """
1259
+ return readme_content
1260
+
1261
+ except Exception as e:
1262
+ return f"README generation failed: {str(e)}"
1263
+
1264
+ # Helper methods for data processing and analysis
1265
+
1266
+ def _get_performance_level(self, score: float) -> str:
1267
+ """Get performance level description for a score"""
1268
+ if score >= 8.0:
1269
+ return "Excellent"
1270
+ elif score >= 6.0:
1271
+ return "Good"
1272
+ elif score >= 4.0:
1273
+ return "Fair"
1274
+ else:
1275
+ return "Needs Improvement"
1276
+
1277
+ def _categorize_recommendation(self, recommendation: str) -> str:
1278
+ """Categorize a recommendation based on content"""
1279
+ rec_lower = recommendation.lower()
1280
+
1281
+ if any(word in rec_lower for word in ['structure', 'heading', 'format']):
1282
+ return "Content Structure"
1283
+ elif any(word in rec_lower for word in ['keyword', 'semantic', 'topic']):
1284
+ return "SEO & Keywords"
1285
+ elif any(word in rec_lower for word in ['clarity', 'readability', 'language']):
1286
+ return "Content Quality"
1287
+ elif any(word in rec_lower for word in ['technical', 'schema', 'markup']):
1288
+ return "Technical SEO"
1289
+ else:
1290
+ return "General"
1291
+
1292
+ def _calculate_avg_response_length(self, qa_results: List[Dict[str, Any]]) -> float:
1293
+ """Calculate average response length for Q&A results"""
1294
+ try:
1295
+ response_lengths = []
1296
+ for result in qa_results:
1297
+ answer = result.get('result', result.get('answer', ''))
1298
+ if answer and not result.get('error'):
1299
+ response_lengths.append(len(answer))
1300
+
1301
+ return sum(response_lengths) / len(response_lengths) if response_lengths else 0
1302
+
1303
+ except Exception:
1304
+ return 0
1305
+
1306
+ def _extract_common_topics(self, qa_results: List[Dict[str, Any]]) -> List[str]:
1307
+ """Extract common topics from Q&A results"""
1308
+ try:
1309
+ # Simple topic extraction based on question keywords
1310
+ topics = {}
1311
+
1312
+ for result in qa_results:
1313
+ question = result.get('query', result.get('question', ''))
1314
+ if question:
1315
+ words = question.lower().split()
1316
+ for word in words:
1317
+ if len(word) > 4: # Focus on longer words
1318
+ topics[word] = topics.get(word, 0) + 1
1319
+
1320
+ # Return top 5 most common topics
1321
+ sorted_topics = sorted(topics.items(), key=lambda x: x[1], reverse=True)
1322
+ return [topic for topic, count in sorted_topics[:5]]
1323
+
1324
+ except Exception:
1325
+ return []
1326
+
1327
+ def _flatten_dict(self, d: Dict[str, Any], parent_dict: Dict[str, Any], parent_key: str = '') -> None:
1328
+ """Flatten nested dictionary for tabular export"""
1329
+ try:
1330
+ for key, value in d.items():
1331
+ new_key = f"{parent_key}_{key}" if parent_key else key
1332
+
1333
+ if isinstance(value, dict):
1334
+ self._flatten_dict(value, parent_dict, new_key)
1335
+ elif isinstance(value, list):
1336
+ parent_dict[new_key] = json.dumps(value) # Convert lists to JSON strings
1337
+ else:
1338
+ parent_dict[new_key] = value
1339
+
1340
+ except Exception:
1341
+ pass # Skip problematic keys
1342
+
1343
+ def _calculate_overall_performance(self, analysis_data: Dict[str, Any]) -> float:
1344
+ """Calculate overall performance score across all analyses"""
1345
+ try:
1346
+ scores = []
1347
+
1348
+ # GEO scores
1349
+ geo_results = analysis_data.get('geo_results', [])
1350
+ for result in geo_results:
1351
+ if 'geo_scores' in result:
1352
+ geo_score_values = list(result['geo_scores'].values())
1353
+ if geo_score_values:
1354
+ scores.append(sum(geo_score_values) / len(geo_score_values))
1355
+
1356
+ # Enhancement scores
1357
+ enhancement = analysis_data.get('enhancement_results', {})
1358
+ if 'scores' in enhancement:
1359
+ enh_scores = list(enhancement['scores'].values())
1360
+ if enh_scores:
1361
+ scores.append(sum(enh_scores) / len(enh_scores))
1362
+
1363
+ return sum(scores) / len(scores) if scores else 0
1364
+
1365
+ except Exception:
1366
+ return 0
1367
+
1368
+ def _extract_key_findings(self, analysis_data: Dict[str, Any]) -> List[str]:
1369
+ """Extract key findings from analysis data"""
1370
+ findings = []
1371
+
1372
+ try:
1373
+ # Add findings based on performance scores
1374
+ overall_perf = self._calculate_overall_performance(analysis_data)
1375
+
1376
+ if overall_perf >= 8.0:
1377
+ findings.append("Content demonstrates excellent AI search optimization")
1378
+ elif overall_perf <= 4.0:
1379
+ findings.append("Significant optimization opportunities identified")
1380
+
1381
+ # Add more specific findings based on data
1382
+ geo_results = analysis_data.get('geo_results', [])
1383
+ if geo_results:
1384
+ findings.append(f"Analyzed {len(geo_results)} pages for GEO performance")
1385
+
1386
+ enhancement = analysis_data.get('enhancement_results', {})
1387
+ if enhancement and 'keywords' in enhancement:
1388
+ findings.append(f"Identified {len(enhancement['keywords'])} key optimization terms")
1389
+
1390
+ return findings[:5] # Return top 5 findings
1391
+
1392
+ except Exception:
1393
+ return ["Unable to extract key findings"]
1394
+
1395
+ def _get_priority_recommendations(self, analysis_data: Dict[str, Any]) -> List[str]:
1396
+ """Get priority recommendations from analysis"""
1397
+ try:
1398
+ recommendations = []
1399
+
1400
+ # Collect all recommendations from different analyses
1401
+ geo_results = analysis_data.get('geo_results', [])
1402
+ for result in geo_results:
1403
+ recommendations.extend(result.get('recommendations', []))
1404
+
1405
+ # Remove duplicates and return top priorities
1406
+ unique_recs = list(set(recommendations))
1407
+ return unique_recs[:3] # Top 3 priority recommendations
1408
+
1409
+ except Exception:
1410
+ return ["Review and implement GEO best practices"]
1411
+
1412
+ def _estimate_roi_potential(self, performance_score: float) -> str:
1413
+ """Estimate ROI potential based on performance score"""
1414
+ if performance_score <= 4.0:
1415
+ return "High - Significant improvement potential"
1416
+ elif performance_score <= 6.0:
1417
+ return "Medium - Moderate improvement opportunities"
1418
+ else:
1419
+ return "Low - Already well-optimized"
1420
+
1421
+ def _suggest_implementation_timeline(self, analysis_data: Dict[str, Any]) -> str:
1422
+ """Suggest implementation timeline"""
1423
+ try:
1424
+ overall_perf = self._calculate_overall_performance(analysis_data)
1425
+
1426
+ if overall_perf <= 4.0:
1427
+ return "3-6 months for comprehensive optimization"
1428
+ elif overall_perf <= 6.0:
1429
+ return "1-3 months for targeted improvements"
1430
+ else:
1431
+ return "Ongoing maintenance and monitoring"
1432
+
1433
+ except Exception:
1434
+ return "Timeline assessment unavailable"
1435
+
1436
+ def _estimate_resource_requirements(self, analysis_data: Dict[str, Any]) -> Dict[str, str]:
1437
+ """Estimate resource requirements"""
1438
+ return {
1439
+ 'content_team': 'Required for content optimization',
1440
+ 'technical_team': 'Required for technical implementations',
1441
+ 'timeline': self._suggest_implementation_timeline(analysis_data),
1442
+ 'budget': 'Varies based on scope of optimizations'
1443
+ }
1444
+
1445
+ def _create_analysis_overview(self, analysis_data: Dict[str, Any]) -> Dict[str, Any]:
1446
+ """Create analysis overview"""
1447
+ try:
1448
+ return {
1449
+ 'analyses_performed': list(analysis_data.keys()),
1450
+ 'total_items_analyzed': sum(len(v) if isinstance(v, list) else 1 for v in analysis_data.values()),
1451
+ 'analysis_scope': 'Comprehensive GEO and content optimization analysis',
1452
+ 'key_focus_areas': ['AI Search Optimization', 'Content Enhancement', 'Performance Analysis']
1453
+ }
1454
+
1455
+ except Exception:
1456
+ return {'error': 'Overview creation failed'}
1457
+
1458
+ def _summarize_performance_metrics(self, analysis_data: Dict[str, Any]) -> Dict[str, float]:
1459
+ """Summarize performance metrics"""
1460
+ try:
1461
+ return {
1462
+ 'overall_performance': self._calculate_overall_performance(analysis_data),
1463
+ 'optimization_potential': 10 - self._calculate_overall_performance(analysis_data),
1464
+ 'completion_rate': 100.0 # Assuming analysis completed successfully
1465
+ }
1466
+
1467
+ except Exception:
1468
+ return {}
1469
+
1470
+ def _identify_improvement_opportunities(self, analysis_data: Dict[str, Any]) -> List[str]:
1471
+ """Identify improvement opportunities"""
1472
+ return self._get_priority_recommendations(analysis_data)
1473
+
1474
+ def _assess_competitive_position(self, analysis_data: Dict[str, Any]) -> str:
1475
+ """Assess competitive position"""
1476
+ try:
1477
+ overall_perf = self._calculate_overall_performance(analysis_data)
1478
+
1479
+ if overall_perf >= 8.0:
1480
+ return "Strong - Above average GEO performance"
1481
+ elif overall_perf >= 6.0:
1482
+ return "Competitive - Meeting industry standards"
1483
+ elif overall_perf >= 4.0:
1484
+ return "Below Average - Improvement needed"
1485
+ else:
1486
+ return "Weak - Significant optimization required"
1487
+
1488
+ except Exception:
1489
+ return "Assessment unavailable"
1490
+
1491
+ def _recommend_next_steps(self, analysis_data: Dict[str, Any]) -> List[str]:
1492
+ """Recommend next steps"""
1493
+ steps = [
1494
+ "Review detailed analysis results",
1495
+ "Prioritize recommendations by impact",
1496
+ "Develop implementation plan",
1497
+ "Monitor performance improvements"
1498
+ ]
1499
+
1500
+ # Add specific steps based on performance
1501
+ overall_perf = self._calculate_overall_performance(analysis_data)
1502
+ if overall_perf <= 4.0:
1503
+ steps.insert(1, "Focus on fundamental GEO optimization")
1504
+
1505
+ return steps
1506
+
1507
+ def _document_methodology(self) -> Dict[str, str]:
1508
+ """Document analysis methodology"""
1509
+ return {
1510
+ 'geo_analysis': 'AI-powered content analysis using specialized GEO metrics',
1511
+ 'content_optimization': 'LLM-based content enhancement and scoring',
1512
+ 'performance_scoring': 'Multi-dimensional scoring system for AI search optimization',
1513
+ 'data_collection': 'Automated content parsing and analysis',
1514
+ 'validation': 'Cross-referenced metrics and quality assurance checks'
1515
+ }
1516
+
1517
+ def _document_data_sources(self, analysis_data: Dict[str, Any]) -> List[str]:
1518
+ """Document data sources used in analysis"""
1519
+ sources = []
1520
+
1521
+ if 'geo_results' in analysis_data:
1522
+ sources.append("Website content analysis")
1523
+ if 'enhancement_results' in analysis_data:
1524
+ sources.append("Content optimization analysis")
1525
+ if 'qa_results' in analysis_data:
1526
+ sources.append("Document Q&A interactions")
1527
+
1528
+ sources.extend([
1529
+ "AI-powered content scoring",
1530
+ "GEO performance metrics",
1531
+ "Industry best practices database"
1532
+ ])
1533
+
1534
+ return sources
1535
+
1536
+ def _document_limitations(self) -> List[str]:
1537
+ """Document analysis limitations"""
1538
+ return [
1539
+ "Analysis based on current content snapshot",
1540
+ "Performance may vary with search engine algorithm updates",
1541
+ "Recommendations require human review for implementation",
1542
+ "Results depend on quality of input content",
1543
+ "AI model performance may vary across different content types"
1544
+ ]
1545
+
1546
+ def _create_appendices(self, analysis_data: Dict[str, Any]) -> Dict[str, Any]:
1547
+ """Create report appendices"""
1548
+ try:
1549
+ return {
1550
+ 'technical_details': {
1551
+ 'models_used': ['GPT-based content analysis', 'Semantic similarity scoring'],
1552
+ 'processing_time': 'Variable based on content volume',
1553
+ 'confidence_intervals': 'Scores provided with ±0.5 accuracy'
1554
+ },
1555
+ 'glossary': {
1556
+ 'GEO': 'Generative Engine Optimization - optimization for AI search engines',
1557
+ 'AI Search Visibility': 'Likelihood of content appearing in AI search results',
1558
+ 'Citation Worthiness': 'Probability of content being cited by AI systems',
1559
+ 'Conversational Readiness': 'Suitability for AI chat responses'
1560
+ },
1561
+ 'references': [
1562
+ 'GEO Best Practices Guide',
1563
+ 'AI Search Engine Optimization Standards',
1564
+ 'Content Performance Benchmarks'
1565
+ ]
1566
+ }
1567
+
1568
+ except Exception:
1569
+ return {}
1570
+
1571
+ def _calculate_avg_processing_time(self, batch_results: List[Dict[str, Any]]) -> float:
1572
+ """Calculate average processing time for batch results"""
1573
+ try:
1574
+ processing_times = []
1575
+
1576
+ for result in batch_results:
1577
+ if 'processing_time' in result:
1578
+ processing_times.append(result['processing_time'])
1579
+
1580
+ return sum(processing_times) / len(processing_times) if processing_times else 0
1581
+
1582
+ except Exception:
1583
+ return 0
1584
+
1585
+ def _identify_common_errors(self, batch_results: List[Dict[str, Any]]) -> List[str]:
1586
+ """Identify common errors in batch processing"""
1587
+ try:
1588
+ error_counts = {}
1589
+
1590
+ for result in batch_results:
1591
+ if result.get('error'):
1592
+ error_msg = str(result['error'])[:50] # First 50 chars
1593
+ error_counts[error_msg] = error_counts.get(error_msg, 0) + 1
1594
+
1595
+ # Return top 3 most common errors
1596
+ sorted_errors = sorted(error_counts.items(), key=lambda x: x[1], reverse=True)
1597
+ return [error for error, count in sorted_errors[:3]]
1598
+
1599
+ except Exception:
1600
+ return []
1601
+
1602
+
1603
class DataValidator:
    """Helper class for validating export data"""

    @staticmethod
    def validate_geo_data(geo_results: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Check a list of GEO results for required fields and sane scores.

        Returns a report dict with 'valid', 'errors' and 'warnings'.
        """
        report: Dict[str, Any] = {'valid': True, 'errors': [], 'warnings': []}
        try:
            if not geo_results:
                report['errors'].append("No GEO results provided")
                report['valid'] = False
                return report

            for index, entry in enumerate(geo_results):
                # Missing sections are tolerated but flagged.
                if 'geo_scores' not in entry:
                    report['warnings'].append(f"Result {index} missing geo_scores")
                if 'page_data' not in entry:
                    report['warnings'].append(f"Result {index} missing page_data")

                # Scores must be numeric and fall within the 0-10 scale.
                if 'geo_scores' in entry:
                    for metric, score in entry['geo_scores'].items():
                        in_range = isinstance(score, (int, float)) and 0 <= score <= 10
                        if not in_range:
                            report['errors'].append(f"Invalid score for {metric} in result {index}")
                            report['valid'] = False

            return report
        except Exception as e:
            report['errors'].append(f"Validation failed: {str(e)}")
            report['valid'] = False
            return report

    @staticmethod
    def validate_enhancement_data(enhancement_result: Dict[str, Any]) -> Dict[str, Any]:
        """Check an enhancement result for the expected score fields and types."""
        report: Dict[str, Any] = {'valid': True, 'errors': [], 'warnings': []}
        try:
            if 'scores' not in enhancement_result:
                report['warnings'].append("Enhancement result missing scores")
            else:
                scores = enhancement_result['scores']
                for required in ('clarity', 'structuredness', 'answerability'):
                    if required not in scores:
                        report['warnings'].append(f"Missing {required} score")
                    elif not isinstance(scores[required], (int, float)):
                        report['errors'].append(f"Invalid {required} score type")
                        report['valid'] = False

            return report
        except Exception as e:
            report['errors'].append(f"Enhancement validation failed: {str(e)}")
            report['valid'] = False
            return report
1675
+
1676
+
1677
class ExportManager:
    """High-level export management class.

    Wraps ResultExporter with validation (via DataValidator) and keeps an
    in-memory audit log of every export attempt.
    """

    def __init__(self):
        self.exporter = ResultExporter()
        self.validator = DataValidator()
        self.export_history = []  # One audit entry per export attempt

    def export_with_validation(self, data: Dict[str, Any], data_type: str,
                               format_type: str = 'json') -> Dict[str, Any]:
        """Validate *data* for *data_type*, then export it as *format_type*.

        Returns:
            Dict with 'success'; on success also 'data' and 'validation',
            on failure 'error' (plus 'validation' when validation failed).
        """
        try:
            # Validate data first
            if data_type == 'geo_analysis':
                validation = self.validator.validate_geo_data(data.get('geo_results', []))
            elif data_type == 'content_optimization':
                validation = self.validator.validate_enhancement_data(data)
            else:
                # Unknown types are exported as raw JSON without validation.
                validation = {'valid': True, 'errors': [], 'warnings': []}

            if not validation['valid']:
                return {
                    'success': False,
                    'error': 'Data validation failed',
                    'validation': validation
                }

            # Dispatch to the matching exporter entry point.
            if data_type == 'geo_analysis':
                result = self.exporter.export_geo_results(
                    data.get('geo_results', []),
                    data.get('website_url', 'unknown'),
                    format_type
                )
            elif data_type == 'content_optimization':
                result = self.exporter.export_enhancement_results(data, format_type)
            else:
                result = json.dumps(data, indent=2, ensure_ascii=False)

            # Log the successful export for auditing.
            self.export_history.append({
                'timestamp': datetime.now().isoformat(),
                'data_type': data_type,
                'format_type': format_type,
                'validation_warnings': validation.get('warnings', []),
                'success': True
            })

            return {
                'success': True,
                'data': result,
                'validation': validation
            }

        except Exception as e:
            # Log the failure as well so history reflects every attempt.
            self.export_history.append({
                'timestamp': datetime.now().isoformat(),
                'data_type': data_type,
                'format_type': format_type,
                'success': False,
                'error': str(e)
            })
            return {
                'success': False,
                'error': f"Export failed: {str(e)}"
            }

    def get_export_history(self) -> List[Dict[str, Any]]:
        """Get export history"""
        return self.export_history

    def clear_export_history(self) -> None:
        """Clear export history"""
        self.export_history.clear()

    def get_supported_formats(self) -> Dict[str, List[str]]:
        """Get supported export formats by data type"""
        return {
            'geo_analysis': ['json', 'csv', 'html', 'xlsx', 'pdf'],
            'content_optimization': ['json', 'html', 'csv'],
            'qa_results': ['json', 'html', 'csv'],
            'batch_analysis': ['json', 'xlsx', 'csv']
        }

    def create_multi_format_export(self, data: Dict[str, Any], data_type: str,
                                   formats: List[str] = None) -> Dict[str, Any]:
        """Create export in multiple formats.

        BUG FIX: 'successful_formats' previously used ``'error' not in data``,
        which performs a *substring* search when the export payload is a
        string — any successful export whose text contained the word "error"
        was misreported as failed.  Failures are now detected structurally:
        only the ``{'error': ...}`` dicts produced below count as failures.
        """
        if formats is None:
            formats = ['json', 'html', 'csv']

        results = {}
        for format_type in formats:
            try:
                export_result = self.export_with_validation(data, data_type, format_type)
                if export_result['success']:
                    results[format_type] = export_result['data']
                else:
                    results[format_type] = {'error': export_result['error']}
            except Exception as e:
                results[format_type] = {'error': str(e)}

        successful = [
            fmt for fmt, payload in results.items()
            if not (isinstance(payload, dict) and 'error' in payload)
        ]
        return {
            'multi_format_export': results,
            'formats_generated': list(results.keys()),
            'successful_formats': successful
        }
+ }
1786
+
1787
+
1788
+ # Utility functions for the export module
1789
+
1790
def create_export_template(data_type: str) -> Dict[str, Any]:
    """Create export template for different data types.

    Returns a sample payload for 'geo_analysis', 'content_optimization' or
    'qa_results'; unknown types yield an empty dict.
    """
    geo_template = {
        'website_url': 'https://example.com',
        'geo_results': [
            {
                'page_data': {
                    'url': 'https://example.com/page1',
                    'title': 'Example Page',
                    'word_count': 500
                },
                'geo_scores': {
                    'ai_search_visibility': 7.5,
                    'query_intent_matching': 6.8,
                    'conversational_readiness': 8.2,
                    'citation_worthiness': 7.1
                },
                'recommendations': [
                    'Improve content structure',
                    'Add more specific examples'
                ]
            }
        ]
    }

    optimization_template = {
        'scores': {
            'clarity': 7.5,
            'structuredness': 6.8,
            'answerability': 8.2
        },
        'keywords': ['example', 'optimization', 'content'],
        'optimized_text': 'This is the optimized version of the content...',
        'optimization_suggestions': [
            'Improve sentence structure',
            'Add more specific keywords'
        ]
    }

    qa_template = [
        {
            'query': 'What is the main topic?',
            'result': 'The main topic is content optimization for AI systems.',
            'sources': [
                {
                    'content': 'Source document content...',
                    'metadata': {'source': 'document1.pdf'}
                }
            ]
        }
    ]

    templates = {
        'geo_analysis': geo_template,
        'content_optimization': optimization_template,
        'qa_results': qa_template,
    }
    return templates.get(data_type, {})
1843
+
1844
+
1845
def export_demo_data() -> Dict[str, Any]:
    """Export demonstration data for testing.

    Builds one "_demo" entry per known template type.
    """
    template_names = ('geo_analysis', 'content_optimization', 'qa_results')
    return {
        f"{name}_demo": create_export_template(name)
        for name in template_names
    }
1854
+
1855
+
1856
+ # Export the main classes and functions
1857
# Public API: `from <module> import *` exposes exactly these names.
__all__ = [
    'ResultExporter',
    'GEOReport',
    'ContentAnalysis',
    'DataValidator',
    'ExportManager',
    'create_export_template',
    'export_demo_data'
]
1866
+
1867
+
1868
+ # Example usage for testing
1869
if __name__ == "__main__":
    # Smoke-test the exporter end-to-end against the bundled demo template.
    exporter = ResultExporter()
    demo_geo_data = create_export_template('geo_analysis')

    def _preview(payload):
        """Truncate long payloads so console output stays readable."""
        return payload[:200] + "..." if len(str(payload)) > 200 else payload

    # Export the same demo data in two formats.
    json_export = exporter.export_geo_results(
        demo_geo_data['geo_results'],
        demo_geo_data['website_url'],
        'json'
    )
    html_export = exporter.export_geo_results(
        demo_geo_data['geo_results'],
        demo_geo_data['website_url'],
        'html'
    )

    print("JSON Export:", _preview(json_export))
    print("\nHTML Export:", _preview(html_export))

    # Also exercise the enhancement exporter.
    demo_enhancement = create_export_template('content_optimization')
    enhancement_export = exporter.export_enhancement_results(demo_enhancement, 'json')

    print("\nEnhancement Export:", _preview(enhancement_export))
utils/optimizer.py ADDED
@@ -0,0 +1,558 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Content Optimization Module
3
+ Enhances content for better AI/LLM performance and GEO scores
4
+ """
5
+
6
+ import json
7
+ import re
8
+ from typing import Dict, Any, List, Optional
9
+ from langchain.prompts import ChatPromptTemplate
10
+
11
+
12
+ class ContentOptimizer:
13
+ """Main class for optimizing content for AI search engines"""
14
+
15
    def __init__(self, llm):
        """Bind the optimizer to *llm* and build all prompt templates.

        Args:
            llm: LangChain-compatible chat model used for every optimization
                call (it is piped after a ChatPromptTemplate in each method).
        """
        self.llm = llm
        self.setup_prompts()
18
+
19
    def setup_prompts(self):
        """Initialize optimization prompts.

        Defines three system-prompt strings used by the optimization methods:
        - enhancement_prompt: general scoring + rewrite (standard mode)
        - seo_style_prompt: AI-search SEO analysis and rewrite (seo mode)
        - competitive_analysis_prompt: gap analysis; has a ``{content}``
          placeholder filled via ``str.format`` before use (competitive mode)

        NOTE(review): the first two prompts contain literal JSON braces;
        ChatPromptTemplate may treat '{'/'}' as template placeholders —
        confirm the prompts render without a KeyError at runtime.
        """

        # Main content enhancement prompt (expects a JSON reply with scores,
        # keywords and the optimized text).
        self.enhancement_prompt = """You are an AI Content Enhancement Specialist. Your purpose is to optimize user-provided text to maximize its effectiveness for large language models (LLMs) in search, question-answering, and conversational AI systems.

Evaluate the input text based on the following criteria, assigning a score from 1–10 for each:

Clarity: How easily can the content be understood?
Structuredness: How well-organized and coherent is the content?
LLM Answerability: How easily can an LLM extract precise answers from the content?

Identify the most salient keywords.

Rewrite the text to improve:
- Clarity and precision
- Logical structure and flow
- Suitability for LLM-based information retrieval

Present your analysis and optimized text in the following JSON format:

```json
{
"scores": {
"clarity": 8.5,
"structuredness": 7.0,
"answerability": 9.0
},
"keywords": ["example", "installation", "setup"],
"optimized_text": "..."
}
```"""

        # SEO-style optimization prompt (expects a JSON reply with seo_analysis,
        # optimized_content and improvement_summary sections).
        self.seo_style_prompt = """You are an AI-first SEO specialist. Optimize this content for AI search engines and LLM systems.

Focus on:
1. Semantic keyword optimization
2. Question-answer format enhancement
3. Factual accuracy and authority signals
4. Conversational readiness
5. Citation-worthy structure

Provide analysis and optimization in JSON:

```json
{
"seo_analysis": {
"keyword_density": "analysis of current keywords",
"semantic_gaps": ["missing semantic terms"],
"readability_score": 8.5,
"authority_signals": ["credentials", "citations"]
},
"optimized_content": {
"title_suggestions": ["optimized title 1", "optimized title 2"],
"meta_description": "AI-optimized meta description",
"enhanced_content": "full optimized content...",
"structured_data_suggestions": ["schema markup recommendations"]
},
"improvement_summary": {
"changes_made": ["change 1", "change 2"],
"expected_impact": "description of expected improvements"
}
}
```"""

        # Competitive content analysis prompt ({content} filled via .format()).
        self.competitive_analysis_prompt = """Compare this content against best practices for AI search optimization. Identify gaps and opportunities.

Original Content: {content}

Analyze against these AI search factors:
- Entity recognition and linking
- Question coverage completeness
- Factual statement clarity
- Conversational flow
- Semantic relationship mapping

Provide competitive analysis in JSON format with specific recommendations."""
98
+
99
+ def optimize_content(self, content: str, analyze_only: bool = False,
100
+ include_keywords: bool = True, optimization_type: str = "standard") -> Dict[str, Any]:
101
+ """
102
+ Main content optimization function
103
+
104
+ Args:
105
+ content (str): Content to optimize
106
+ analyze_only (bool): If True, only analyze without rewriting
107
+ include_keywords (bool): Whether to include keyword analysis
108
+ optimization_type (str): Type of optimization ("standard", "seo", "competitive")
109
+
110
+ Returns:
111
+ Dict: Optimization results with scores and enhanced content
112
+ """
113
+ try:
114
+ # Choose optimization approach
115
+ if optimization_type == "seo":
116
+ return self._seo_style_optimization(content, analyze_only)
117
+ elif optimization_type == "competitive":
118
+ return self._competitive_optimization(content)
119
+ else:
120
+ return self._standard_optimization(content, analyze_only, include_keywords)
121
+
122
+ except Exception as e:
123
+ return {'error': f"Optimization failed: {str(e)}"}
124
+
125
+ def _standard_optimization(self, content: str, analyze_only: bool, include_keywords: bool) -> Dict[str, Any]:
126
+ """Standard content optimization using enhancement prompt"""
127
+ try:
128
+ # Modify prompt based on options
129
+ prompt_text = self.enhancement_prompt
130
+
131
+ if analyze_only:
132
+ prompt_text = prompt_text.replace(
133
+ "Rewrite the text to improve:",
134
+ "Analyze the text for potential improvements in:"
135
+ ).replace(
136
+ '"optimized_text": "..."',
137
+ '"optimization_suggestions": ["suggestion 1", "suggestion 2"]'
138
+ )
139
+
140
+ if not include_keywords:
141
+ prompt_text = prompt_text.replace(
142
+ '"keywords": ["example", "installation", "setup"],',
143
+ ''
144
+ )
145
+
146
+ # Create and run chain
147
+ prompt_template = ChatPromptTemplate.from_messages([
148
+ ("system", prompt_text),
149
+ ("user", content[:6000]) # Limit content length
150
+ ])
151
+
152
+ chain = prompt_template | self.llm
153
+ result = chain.invoke({})
154
+
155
+ # Parse result
156
+ result_content = result.content if hasattr(result, 'content') else str(result)
157
+ parsed_result = self._parse_optimization_result(result_content)
158
+
159
+ # Add metadata
160
+ parsed_result.update({
161
+ 'optimization_type': 'standard',
162
+ 'analyze_only': analyze_only,
163
+ 'original_length': len(content),
164
+ 'original_word_count': len(content.split())
165
+ })
166
+
167
+ return parsed_result
168
+
169
+ except Exception as e:
170
+ return {'error': f"Standard optimization failed: {str(e)}"}
171
+
172
+ def _seo_style_optimization(self, content: str, analyze_only: bool) -> Dict[str, Any]:
173
+ """SEO-focused optimization for AI search engines"""
174
+ try:
175
+ prompt_template = ChatPromptTemplate.from_messages([
176
+ ("system", self.seo_style_prompt),
177
+ ("user", f"Optimize this content for AI search engines:\n\n{content[:6000]}")
178
+ ])
179
+
180
+ chain = prompt_template | self.llm
181
+ result = chain.invoke({})
182
+
183
+ result_content = result.content if hasattr(result, 'content') else str(result)
184
+ parsed_result = self._parse_optimization_result(result_content)
185
+
186
+ # Add SEO-specific metadata
187
+ parsed_result.update({
188
+ 'optimization_type': 'seo',
189
+ 'analyze_only': analyze_only,
190
+ 'seo_focused': True
191
+ })
192
+
193
+ return parsed_result
194
+
195
+ except Exception as e:
196
+ return {'error': f"SEO optimization failed: {str(e)}"}
197
+
198
+ def _competitive_optimization(self, content: str) -> Dict[str, Any]:
199
+ """Competitive analysis-based optimization"""
200
+ try:
201
+ formatted_prompt = self.competitive_analysis_prompt.format(content=content[:5000])
202
+
203
+ prompt_template = ChatPromptTemplate.from_messages([
204
+ ("system", formatted_prompt),
205
+ ("user", "Perform the competitive analysis and provide optimization recommendations.")
206
+ ])
207
+
208
+ chain = prompt_template | self.llm
209
+ result = chain.invoke({})
210
+
211
+ result_content = result.content if hasattr(result, 'content') else str(result)
212
+ parsed_result = self._parse_optimization_result(result_content)
213
+
214
+ parsed_result.update({
215
+ 'optimization_type': 'competitive',
216
+ 'competitive_analysis': True
217
+ })
218
+
219
+ return parsed_result
220
+
221
+ except Exception as e:
222
+ return {'error': f"Competitive optimization failed: {str(e)}"}
223
+
224
+ def batch_optimize_content(self, content_list: List[str], optimization_type: str = "standard") -> List[Dict[str, Any]]:
225
+ """
226
+ Optimize multiple pieces of content in batch
227
+
228
+ Args:
229
+ content_list (List[str]): List of content pieces to optimize
230
+ optimization_type (str): Type of optimization to apply
231
+
232
+ Returns:
233
+ List[Dict]: List of optimization results
234
+ """
235
+ results = []
236
+
237
+ for i, content in enumerate(content_list):
238
+ try:
239
+ result = self.optimize_content(
240
+ content,
241
+ optimization_type=optimization_type
242
+ )
243
+ result['batch_index'] = i
244
+ results.append(result)
245
+
246
+ except Exception as e:
247
+ results.append({
248
+ 'batch_index': i,
249
+ 'error': f"Batch optimization failed: {str(e)}"
250
+ })
251
+
252
+ return results
253
+
254
def generate_content_variations(self, content: str, num_variations: int = 3) -> List[Dict[str, Any]]:
    """
    Generate multiple optimized variations of the same content.

    Each variation targets a different consumption style (conversational,
    authoritative, structured); the count is capped by the number of
    built-in variation prompts (currently 3).

    Args:
        content (str): Original content (only the first 4000 characters
            are forwarded to the LLM)
        num_variations (int): Number of variations to generate

    Returns:
        List[Dict]: List of content variations with analysis; a failed
        variation is reported as ``{'variation_index': i, 'error': ...}``
    """
    variations = []

    # One instruction per supported variation style; the list length
    # caps how many variations can actually be produced.
    variation_prompts = [
        "Create a more conversational version optimized for AI chat responses",
        "Create a more authoritative version optimized for citations",
        "Create a more structured version optimized for question-answering"
    ]

    for i in range(min(num_variations, len(variation_prompts))):
        try:
            # f-string: {{...}} renders as literal braces in the JSON
            # example. NOTE(review): the rendered prompt still contains
            # single braces, which ChatPromptTemplate may interpret as
            # template variables on invoke — confirm against the
            # langchain version in use.
            custom_prompt = f"""You are optimizing content for AI systems. {variation_prompts[i]}.

Original content: {content[:4000]}

Provide the optimized variation in JSON format:
```json
{{
    "variation_type": "conversational/authoritative/structured",
    "optimized_content": "the rewritten content...",
    "key_changes": ["change 1", "change 2"],
    "target_use_case": "description of ideal use case"
}}
```"""

            prompt_template = ChatPromptTemplate.from_messages([
                ("system", custom_prompt),
                ("user", "Generate the variation.")
            ])

            chain = prompt_template | self.llm
            result = chain.invoke({})

            # Chat models return a message object with .content; fall
            # back to str() for plain-string outputs.
            result_content = result.content if hasattr(result, 'content') else str(result)
            parsed_result = self._parse_optimization_result(result_content)

            parsed_result.update({
                'variation_index': i,
                'variation_prompt': variation_prompts[i]
            })

            variations.append(parsed_result)

        except Exception as e:
            variations.append({
                'variation_index': i,
                'error': f"Variation generation failed: {str(e)}"
            })

    return variations
314
+
315
def analyze_content_readability(self, content: str) -> Dict[str, Any]:
    """
    Analyze content readability for AI systems.

    Computes surface metrics (word/sentence/paragraph counts and
    averages), complexity indicators (long sentences, long words), an
    AI-readability score, and improvement recommendations.

    Args:
        content (str): Content to analyze

    Returns:
        Dict: Readability analysis results, or ``{'error': ...}`` when
        the analysis fails.
    """
    try:
        words = content.split()
        # Sentences are split on terminal punctuation; empty fragments
        # (e.g. after a trailing period) are discarded.
        sentences = [s.strip() for s in re.split(r'[.!?]+', content) if s.strip()]
        paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]

        word_total = len(words)
        sentence_total = len(sentences)
        paragraph_total = len(paragraphs)

        avg_words_per_sentence = word_total / sentence_total if sentence_total else 0
        avg_sentences_per_paragraph = sentence_total / paragraph_total if paragraph_total else 0
        avg_word_length = sum(len(word) for word in words) / word_total if word_total else 0

        # Complexity indicators: sentences over 20 words and words over
        # 6 characters are treated as "hard" for AI consumption.
        long_sentence_total = sum(1 for s in sentences if len(s.split()) > 20)
        complex_word_total = sum(1 for w in words if len(w) > 6)

        long_sentence_pct = long_sentence_total / sentence_total * 100 if sentence_total else 0
        complex_word_pct = complex_word_total / word_total * 100 if word_total else 0

        return {
            'basic_metrics': {
                'total_words': word_total,
                'total_sentences': sentence_total,
                'total_paragraphs': paragraph_total,
                'avg_words_per_sentence': avg_words_per_sentence,
                'avg_sentences_per_paragraph': avg_sentences_per_paragraph,
                'avg_word_length': avg_word_length
            },
            'complexity_indicators': {
                'long_sentences_count': long_sentence_total,
                'long_sentences_percentage': long_sentence_pct,
                'complex_words_count': complex_word_total,
                'complex_words_percentage': complex_word_pct
            },
            'ai_readability_score': self._calculate_ai_readability_score({
                'avg_words_per_sentence': avg_words_per_sentence,
                'avg_word_length': avg_word_length,
                'complex_words_percentage': complex_word_pct
            }),
            'recommendations': self._generate_readability_recommendations({
                'avg_words_per_sentence': avg_words_per_sentence,
                'long_sentences_percentage': long_sentence_pct,
                'complex_words_percentage': complex_word_pct
            })
        }

    except Exception as e:
        return {'error': f"Readability analysis failed: {str(e)}"}
373
+
374
def extract_key_entities(self, content: str) -> Dict[str, Any]:
    """
    Extract key entities and topics for optimization.

    Args:
        content (str): Content to analyze (only the first 5000
            characters are forwarded to the LLM)

    Returns:
        Dict: Extracted entities and topics as parsed by
        ``_parse_optimization_result``, or ``{'error': ...}`` on failure.
    """
    try:
        # The double braces ({{ }}) keep the JSON example literal when
        # str.format substitutes {content} below. NOTE(review): after
        # .format the string holds single braces, which
        # ChatPromptTemplate may treat as template variables on invoke —
        # confirm against the langchain version in use.
        entity_prompt = """Extract key entities, topics, and concepts from this content for AI optimization.

Content: {content}

Identify:
1. Named entities (people, places, organizations)
2. Key concepts and topics
3. Technical terms and jargon
4. Potential semantic keywords
5. Question-answer opportunities

Format as JSON:
```json
{{
    "named_entities": ["entity1", "entity2"],
    "key_topics": ["topic1", "topic2"],
    "technical_terms": ["term1", "term2"],
    "semantic_keywords": ["keyword1", "keyword2"],
    "question_opportunities": ["What is...", "How does..."],
    "entity_relationships": ["relationship descriptions"]
}}
```"""

        # Truncate to keep the prompt within a safe context budget.
        prompt_template = ChatPromptTemplate.from_messages([
            ("system", entity_prompt.format(content=content[:5000])),
            ("user", "Extract the entities and topics.")
        ])

        chain = prompt_template | self.llm
        result = chain.invoke({})

        # Chat models return a message object with .content; fall back
        # to str() for plain-string outputs.
        result_content = result.content if hasattr(result, 'content') else str(result)
        return self._parse_optimization_result(result_content)

    except Exception as e:
        return {'error': f"Entity extraction failed: {str(e)}"}
421
+
422
def optimize_for_voice_search(self, content: str) -> Dict[str, Any]:
    """
    Optimize content specifically for voice search and conversational AI.

    Args:
        content (str): Content to optimize (only the first 4000
            characters are forwarded to the LLM)

    Returns:
        Dict: Voice search optimization results tagged with
        ``optimization_type='voice_search'``, or ``{'error': ...}`` on
        failure.
    """
    try:
        # Double braces ({{ }}) keep the JSON example literal when
        # str.format substitutes {content}. NOTE(review): the formatted
        # string still contains single braces, which ChatPromptTemplate
        # may interpret as template variables on invoke — verify.
        voice_prompt = """Optimize this content for voice search and conversational AI systems.

Focus on:
1. Natural language patterns
2. Question-based structure
3. Conversational tone
4. Clear, direct answers
5. Featured snippet optimization

Original content: {content}

Provide optimization in JSON:
```json
{{
    "voice_optimized_content": "conversational version...",
    "question_answer_pairs": [
        {{"question": "What is...", "answer": "Direct answer..."}},
        {{"question": "How does...", "answer": "Step by step..."}}
    ],
    "featured_snippet_candidates": ["snippet 1", "snippet 2"],
    "natural_language_improvements": ["improvement 1", "improvement 2"],
    "conversational_score": 8.5
}}
```"""

        prompt_template = ChatPromptTemplate.from_messages([
            ("system", voice_prompt.format(content=content[:4000])),
            ("user", "Optimize for voice search.")
        ])

        chain = prompt_template | self.llm
        result = chain.invoke({})

        # Chat models return a message object with .content; fall back
        # to str() for plain-string outputs.
        result_content = result.content if hasattr(result, 'content') else str(result)
        parsed_result = self._parse_optimization_result(result_content)

        # Tag the result so callers can distinguish optimization modes.
        parsed_result.update({
            'optimization_type': 'voice_search',
            'voice_optimized': True
        })

        return parsed_result

    except Exception as e:
        return {'error': f"Voice search optimization failed: {str(e)}"}
478
+
479
+ def _parse_optimization_result(self, response_text: str) -> Dict[str, Any]:
480
+ """Parse LLM response and extract structured results"""
481
+ try:
482
+ # Find JSON content in the response
483
+ json_start = response_text.find('{')
484
+ json_end = response_text.rfind('}') + 1
485
+
486
+ if json_start != -1 and json_end != -1:
487
+ json_str = response_text[json_start:json_end]
488
+ parsed = json.loads(json_str)
489
+
490
+ # Ensure consistent structure
491
+ if 'scores' not in parsed and 'score' in parsed:
492
+ parsed['scores'] = parsed['score']
493
+
494
+ return parsed
495
+ else:
496
+ # If no JSON found, return raw response with error flag
497
+ return {
498
+ 'raw_response': response_text,
499
+ 'parsing_error': 'No JSON structure found in response',
500
+ 'scores': {'clarity': 0, 'structuredness': 0, 'answerability': 0}
501
+ }
502
+
503
+ except json.JSONDecodeError as e:
504
+ return {
505
+ 'raw_response': response_text,
506
+ 'parsing_error': f'JSON decode error: {str(e)}',
507
+ 'scores': {'clarity': 0, 'structuredness': 0, 'answerability': 0}
508
+ }
509
+ except Exception as e:
510
+ return {
511
+ 'raw_response': response_text,
512
+ 'parsing_error': f'Unexpected parsing error: {str(e)}',
513
+ 'scores': {'clarity': 0, 'structuredness': 0, 'answerability': 0}
514
+ }
515
+
516
+ def _calculate_ai_readability_score(self, metrics: Dict[str, float]) -> float:
517
+ """Calculate AI-specific readability score"""
518
+ try:
519
+ # Optimal ranges for AI consumption
520
+ optimal_words_per_sentence = 15 # Sweet spot for AI processing
521
+ optimal_word_length = 5 # Balance of complexity and clarity
522
+ optimal_complex_words_percentage = 15 # Some complexity is good for authority
523
+
524
+ # Calculate deviations from optimal
525
+ sentence_score = max(0, 10 - abs(metrics['avg_words_per_sentence'] - optimal_words_per_sentence) * 0.5)
526
+ word_length_score = max(0, 10 - abs(metrics['avg_word_length'] - optimal_word_length) * 2)
527
+ complexity_score = max(0, 10 - abs(metrics['complex_words_percentage'] - optimal_complex_words_percentage) * 0.3)
528
+
529
+ # Weighted average
530
+ overall_score = (sentence_score * 0.4 + word_length_score * 0.3 + complexity_score * 0.3)
531
+
532
+ return round(overall_score, 1)
533
+
534
+ except Exception:
535
+ return 5.0 # Default neutral score
536
+
537
+ def _generate_readability_recommendations(self, metrics: Dict[str, float]) -> List[str]:
538
+ """Generate specific readability improvement recommendations"""
539
+ recommendations = []
540
+
541
+ try:
542
+ if metrics['avg_words_per_sentence'] > 20:
543
+ recommendations.append("Break down long sentences for better AI processing")
544
+ elif metrics['avg_words_per_sentence'] < 8:
545
+ recommendations.append("Consider combining very short sentences for better context")
546
+
547
+ if metrics['long_sentences_percentage'] > 30:
548
+ recommendations.append("Reduce the number of complex sentences (>20 words)")
549
+
550
+ if metrics['complex_words_percentage'] > 25:
551
+ recommendations.append("Simplify vocabulary where possible for broader accessibility")
552
+ elif metrics['complex_words_percentage'] < 5:
553
+ recommendations.append("Add more specific terminology to establish authority")
554
+
555
+ return recommendations
556
+
557
+ except Exception:
558
+ return ["Unable to generate specific recommendations"]
utils/parser.py ADDED
@@ -0,0 +1,549 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Content Parsing Module
3
+ Handles extraction of content from PDFs, text, and webpages
4
+ """
5
+
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+ from urllib.parse import urljoin, urlparse
9
+ from typing import List, Dict, Any
10
+ import time
11
+ from langchain_community.document_loaders import PyPDFLoader
12
+ from langchain.schema import Document
13
+
14
+
15
class BaseParser:
    """Abstract parent for the content parsers in this module.

    Concrete subclasses populate ``supported_formats`` and override
    ``parse``; ``validate_source`` is an optional hook that defaults to
    accepting every source.
    """

    def __init__(self):
        # Subclasses list the extensions/schemes they handle here.
        self.supported_formats = []

    def parse(self, source: str) -> List[Document]:
        """Parse content from *source* into LangChain Documents."""
        raise NotImplementedError("Subclasses must implement parse method")

    def validate_source(self, source: str) -> bool:
        """Return True when *source* looks processable (default: always)."""
        return True
28
+
29
+
30
class PDFParser(BaseParser):
    """Parser for PDF documents, backed by LangChain's ``PyPDFLoader``."""

    def __init__(self):
        super().__init__()
        self.supported_formats = ['.pdf']

    def parse(self, pdf_path: str) -> List[Document]:
        """
        Parse PDF file and return list of Document objects.

        Args:
            pdf_path (str): Path to the PDF file

        Returns:
            List[Document]: List of parsed documents with metadata

        Raises:
            Exception: wrapping any loader failure.
        """
        try:
            pages = PyPDFLoader(pdf_path).load_and_split()
            page_total = len(pages)

            # Tag every page with provenance so downstream consumers can
            # trace a chunk back to its position in the PDF.
            for page_number, page in enumerate(pages, start=1):
                page.metadata.update({
                    'source_type': 'pdf',
                    'page_number': page_number,
                    'total_pages': page_total,
                    'parser': 'PDFParser'
                })

            return pages

        except Exception as e:
            raise Exception(f"Error parsing PDF: {str(e)}")

    def get_pdf_metadata(self, pdf_path: str) -> Dict[str, Any]:
        """Extract summary metadata (page/word counts) from a PDF file."""
        try:
            pages = PyPDFLoader(pdf_path).load()

            page_total = len(pages)
            word_total = sum(len(page.page_content.split()) for page in pages)

            return {
                'total_pages': page_total,
                'total_words': word_total,
                'average_words_per_page': word_total / page_total if page_total > 0 else 0,
                'file_type': 'PDF',
                'parser_used': 'PyPDFLoader'
            }

        except Exception as e:
            return {'error': f"Could not extract metadata: {str(e)}"}
84
+
85
+
86
class TextParser(BaseParser):
    """Parser for plain text content.

    Short texts become a single Document; long texts are chunked on
    sentence boundaries. Also provides lightweight structural analysis.
    """

    def __init__(self):
        super().__init__()
        self.supported_formats = ['.txt', 'plain_text']
        self.chunk_size = 1000  # Default chunk size (characters) for long texts

    def parse(self, text_content: str, chunk_size: int = None) -> List[Document]:
        """
        Parse text content and return list of Document objects.

        Args:
            text_content (str): Raw text content
            chunk_size (int): Optional chunk size for splitting long texts
                (defaults to ``self.chunk_size``)

        Returns:
            List[Document]: List of documents, potentially chunked

        Raises:
            Exception: wrapping empty input or any processing failure.
        """
        try:
            if not text_content.strip():
                raise ValueError("Empty text content provided")

            chunk_size = chunk_size or self.chunk_size

            # Short text: return a single document, no chunking overhead.
            if len(text_content) <= chunk_size:
                return [Document(
                    page_content=text_content,
                    metadata={
                        'source_type': 'text',
                        'word_count': len(text_content.split()),
                        'char_count': len(text_content),
                        'chunk_index': 0,
                        'total_chunks': 1,
                        'parser': 'TextParser'
                    }
                )]

            # Split long text into sentence-aligned chunks.
            chunks = self._split_text_into_chunks(text_content, chunk_size)
            documents = []

            for i, chunk in enumerate(chunks):
                documents.append(Document(
                    page_content=chunk,
                    metadata={
                        'source_type': 'text',
                        'word_count': len(chunk.split()),
                        'char_count': len(chunk),
                        'chunk_index': i,
                        'total_chunks': len(chunks),
                        'parser': 'TextParser'
                    }
                ))

            return documents

        except Exception as e:
            raise Exception(f"Error parsing text: {str(e)}")

    def _split_text_into_chunks(self, text: str, chunk_size: int) -> List[str]:
        """Split text into chunks while preserving sentence boundaries."""
        sentences = text.split('. ')
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            # Add the sentence to the current chunk only if it still fits.
            test_chunk = current_chunk + sentence + ". "

            if len(test_chunk) <= chunk_size:
                current_chunk = test_chunk
            else:
                # Flush the current chunk and start a new one.
                if current_chunk.strip():
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + ". "

        # Flush the trailing chunk if it has content.
        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        return chunks

    def analyze_text_structure(self, text_content: str) -> Dict[str, Any]:
        """Analyze the structure and characteristics of text content."""
        try:
            lines = text_content.split('\n')
            words = text_content.split()
            # BUG FIX: the original reported the count of non-empty
            # sentences but divided the averages by the raw split('.')
            # list, which includes empty fragments (e.g. after a trailing
            # period) — so the averages disagreed with the counts. Filter
            # once and use the same list everywhere.
            sentences = [s for s in text_content.split('.') if s.strip()]
            paragraphs = [p.strip() for p in text_content.split('\n\n') if p.strip()]

            return {
                'total_words': len(words),
                'total_sentences': len(sentences),
                'total_lines': len(lines),
                'total_paragraphs': len(paragraphs),
                'average_words_per_sentence': len(words) / len(sentences) if sentences else 0,
                'average_sentences_per_paragraph': len(sentences) / len(paragraphs) if paragraphs else 0,
                'character_count': len(text_content),
                'reading_time_minutes': len(words) / 200,  # ~200 words per minute
                'complexity_score': self._calculate_text_complexity(text_content)
            }

        except Exception as e:
            return {'error': f"Could not analyze text structure: {str(e)}"}

    def _calculate_text_complexity(self, text: str) -> float:
        """Calculate a simple text complexity score (0-10)."""
        words = text.split()
        sentences = [s for s in text.split('.') if s.strip()]

        if not sentences:
            return 0.0

        # Longer sentences and longer words both raise complexity.
        avg_words_per_sentence = len(words) / len(sentences)
        avg_chars_per_word = sum(len(word) for word in words) / len(words) if words else 0

        # Simple weighted score, capped at 10.
        complexity = (avg_words_per_sentence * 0.1) + (avg_chars_per_word * 0.5)
        return min(complexity, 10.0)
215
+
216
+
217
class WebpageParser(BaseParser):
    """Parser for web content.

    Fetches pages over HTTP with retries, strips boilerplate markup with
    BeautifulSoup, and returns structured dictionaries containing the
    main text plus metadata (title, headings, links, images, ...).
    """

    def __init__(self):
        super().__init__()
        self.supported_formats = ['http', 'https']
        # Browser-like User-Agent: some sites reject the default
        # python-requests UA outright.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.timeout = 10      # seconds per HTTP request
        self.max_retries = 3   # attempts per page, with exponential backoff

    def parse_website(self, url: str, max_pages: int = 1, include_subpages: bool = False) -> List[Dict[str, Any]]:
        """
        Parse website content and return structured data.

        Args:
            url (str): Website URL to parse
            max_pages (int): Maximum number of pages to parse
            include_subpages (bool): Whether to include same-domain subpages

        Returns:
            List[Dict]: List of page data with content and metadata
        """
        try:
            pages_data = []
            urls_to_process = [url]
            processed_urls = set()

            # Discover additional same-domain URLs before crawling.
            if include_subpages and max_pages > 1:
                subpage_urls = self._find_subpages(url, max_pages - 1)
                urls_to_process.extend(subpage_urls)

            for current_url in urls_to_process[:max_pages]:
                if current_url in processed_urls:
                    continue

                page_data = self._parse_single_page(current_url)
                if page_data:
                    pages_data.append(page_data)
                    processed_urls.add(current_url)

                # Be polite to the host between requests.
                time.sleep(1)

            return pages_data

        except Exception as e:
            raise Exception(f"Error parsing website: {str(e)}")

    def _parse_single_page(self, url: str) -> Dict[str, Any]:
        """Fetch and parse one page.

        Returns a data dict on success, an ``{'url', 'error'}`` dict on
        parse failure, or None when no response could be obtained.
        """
        try:
            # Fetch with retries and exponential backoff.
            response = None
            for attempt in range(self.max_retries):
                try:
                    response = requests.get(url, headers=self.headers, timeout=self.timeout)
                    response.raise_for_status()
                    break
                except requests.RequestException as e:
                    if attempt == self.max_retries - 1:
                        raise e
                    time.sleep(2 ** attempt)  # Exponential backoff

            if not response:
                return None

            soup = BeautifulSoup(response.content, 'html.parser')

            # Drop chrome/boilerplate elements before extracting text.
            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                element.decompose()

            main_content = self._extract_main_content(soup)

            title = self._extract_title(soup)
            description = self._extract_description(soup)
            headings = self._extract_headings(soup)
            links = self._extract_links(soup, url)

            cleaned_text = self._clean_text_content(main_content)

            return {
                'url': url,
                'title': title,
                'description': description,
                'content': cleaned_text,
                'headings': headings,
                'internal_links': links['internal'],
                'external_links': links['external'],
                'word_count': len(cleaned_text.split()),
                'char_count': len(cleaned_text),
                'meta_keywords': self._extract_meta_keywords(soup),
                'images': self._extract_images(soup, url),
                'parser': 'WebpageParser',
                'parsed_at': time.strftime('%Y-%m-%d %H:%M:%S')
            }

        except Exception as e:
            return {'url': url, 'error': f"Failed to parse page: {str(e)}"}

    def _extract_main_content(self, soup: BeautifulSoup) -> str:
        """Extract the main content from the page."""
        # Try common main-content containers in order of preference.
        content_selectors = [
            'main',
            'article',
            '[role="main"]',
            '.content',
            '.main-content',
            '#content',
            '#main',
            '.post-content',
            '.entry-content'
        ]

        for selector in content_selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text(separator=' ', strip=True)

        # Fallback to body content, then the whole document.
        body = soup.find('body')
        if body:
            return body.get_text(separator=' ', strip=True)

        return soup.get_text(separator=' ', strip=True)

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract page title, falling back to the first <h1>."""
        title_tag = soup.find('title')
        if title_tag:
            return title_tag.get_text().strip()

        h1 = soup.find('h1')
        if h1:
            return h1.get_text().strip()

        return "No Title Found"

    def _extract_description(self, soup: BeautifulSoup) -> str:
        """Extract meta description, falling back to Open Graph."""
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc and meta_desc.get('content'):
            return meta_desc['content'].strip()

        og_desc = soup.find('meta', attrs={'property': 'og:description'})
        if og_desc and og_desc.get('content'):
            return og_desc['content'].strip()

        return "No Description Found"

    def _extract_headings(self, soup: BeautifulSoup) -> List[Dict[str, Any]]:
        """Extract all h1-h6 headings with their hierarchy level."""
        headings = []

        for level in range(1, 7):
            for heading in soup.find_all(f'h{level}'):
                text = heading.get_text(strip=True)
                if text:
                    headings.append({
                        'level': level,
                        'text': text,
                        'id': heading.get('id', ''),
                        'class': heading.get('class', [])
                    })

        return headings

    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> Dict[str, List[str]]:
        """Extract deduplicated internal and external links."""
        internal_links = []
        external_links = []
        base_domain = urlparse(base_url).netloc

        for link in soup.find_all('a', href=True):
            full_url = urljoin(base_url, link['href'])
            parsed_url = urlparse(full_url)

            if parsed_url.netloc == base_domain:
                internal_links.append(full_url)
            elif parsed_url.netloc:  # External link with a real domain
                external_links.append(full_url)

        return {
            'internal': list(set(internal_links)),
            'external': list(set(external_links))
        }

    def _extract_meta_keywords(self, soup: BeautifulSoup) -> List[str]:
        """Extract meta keywords if available."""
        meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
        if meta_keywords and meta_keywords.get('content'):
            keywords = meta_keywords['content'].split(',')
            return [kw.strip() for kw in keywords if kw.strip()]
        return []

    def _extract_images(self, soup: BeautifulSoup, base_url: str) -> List[Dict[str, str]]:
        """Extract image URLs (absolutized) with alt/title text."""
        images = []

        for img in soup.find_all('img'):
            src = img.get('src')
            if src:
                images.append({
                    'src': urljoin(base_url, src),
                    'alt': img.get('alt', ''),
                    'title': img.get('title', '')
                })

        return images

    def _clean_text_content(self, text: str) -> str:
        """Clean and normalize text content.

        Drops empty and single-character lines, then collapses every
        run of whitespace into a single space.
        """
        if not text:
            return ""

        # Keep only meaningful lines (skip blanks and stray single chars).
        cleaned_lines = []
        for line in text.split('\n'):
            line = line.strip()
            if line and len(line) > 1:
                cleaned_lines.append(line)

        # BUG FIX: the original collapse loop replaced a single space
        # with a single space while a space was present — a no-op that
        # never terminates on any text containing whitespace (the
        # intended double-space pattern was lost). split()/join collapses
        # all whitespace runs in one pass instead.
        return ' '.join(' '.join(cleaned_lines).split())

    def _find_subpages(self, url: str, max_subpages: int) -> List[str]:
        """Find same-domain subpage URLs linked from the given page."""
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            base_domain = urlparse(url).netloc
            subpages = set()

            for link in soup.find_all('a', href=True):
                full_url = urljoin(url, link['href'])
                parsed_url = urlparse(full_url)

                # Same-domain pages only; skip obvious binary assets.
                if (parsed_url.netloc == base_domain and
                    full_url != url and
                    not any(ext in full_url.lower() for ext in ['.pdf', '.jpg', '.png', '.gif', '.zip'])):
                    subpages.add(full_url)

                    if len(subpages) >= max_subpages:
                        break

            return list(subpages)[:max_subpages]

        except Exception:
            # Best-effort discovery: no subpages on any failure.
            return []

    def validate_url(self, url: str) -> bool:
        """Return True when a HEAD request to *url* answers 200."""
        try:
            response = requests.head(url, headers=self.headers, timeout=5)
            return response.status_code == 200
        except Exception:
            # Narrowed from a bare except; any failure means "not accessible".
            return False

    def get_website_info(self, url: str) -> Dict[str, Any]:
        """Get basic information about a website."""
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # BUG FIX: the original read soup.get('lang') on the document
            # object, which has no attributes and therefore always
            # returned 'unknown'; the lang attribute lives on <html>.
            html_tag = soup.find('html')
            language = html_tag.get('lang', 'unknown') if html_tag else 'unknown'

            return {
                'url': url,
                'title': self._extract_title(soup),
                'description': self._extract_description(soup),
                'meta_keywords': self._extract_meta_keywords(soup),
                'has_robots_meta': bool(soup.find('meta', attrs={'name': 'robots'})),
                'has_viewport_meta': bool(soup.find('meta', attrs={'name': 'viewport'})),
                'language': language,
                'status_code': response.status_code,
                'content_type': response.headers.get('content-type', 'unknown'),
                'server': response.headers.get('server', 'unknown')
            }

        except Exception as e:
            return {'url': url, 'error': f"Could not get website info: {str(e)}"}
524
+
525
+
526
class ParserFactory:
    """Factory class to create appropriate parsers."""

    @staticmethod
    def get_parser(source_type: str):
        """Get a new parser instance for *source_type* (or None).

        Accepts 'pdf', 'text', 'webpage', or 'url' (case-insensitive).
        """
        # Map to classes and instantiate lazily: the original built all
        # four parser objects on every call just to return one of them.
        parser_classes = {
            'pdf': PDFParser,
            'text': TextParser,
            'webpage': WebpageParser,
            'url': WebpageParser
        }

        parser_cls = parser_classes.get(source_type.lower())
        return parser_cls() if parser_cls else None

    @staticmethod
    def detect_source_type(source: str) -> str:
        """Detect the type of content source from its shape."""
        if source.startswith(('http://', 'https://')):
            return 'webpage'
        # BUG FIX: case-insensitive so 'REPORT.PDF' routes to the PDF
        # parser, consistent with get_parser's lowercased lookup.
        elif source.lower().endswith('.pdf'):
            return 'pdf'
        else:
            return 'text'
utils/scorer.py ADDED
@@ -0,0 +1,501 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GEO Scoring Module
3
+ Analyzes content for Generative Engine Optimization (GEO) performance
4
+ """
5
+
6
+ import json
7
+ from typing import Dict, Any, List
8
+ from langchain.prompts import ChatPromptTemplate
9
+
10
+
11
+ class GEOScorer:
12
+ """Main class for calculating GEO scores and analysis"""
13
+
14
def __init__(self, llm):
    """Create a scorer bound to an LLM.

    Args:
        llm: Chat model combined into ``prompt | llm`` chains elsewhere
            in this class; presumably any LangChain-compatible model
            supporting ``invoke`` — TODO confirm against callers.
    """
    self.llm = llm
    # Build the reusable prompt strings once up front.
    self.setup_prompts()
17
+
18
def setup_prompts(self):
    """Initialize prompts for different types of analysis.

    Defines three prompt strings used by the scoring methods:
    a full eight-criterion GEO rubric, a faster four-criterion quick
    score, and a two-document competitive comparison.
    """

    # Main GEO analysis prompt: full eight-criterion rubric plus gap
    # and opportunity identification, with an explicit JSON schema the
    # model is asked to follow.
    self.geo_analysis_prompt = """You are a Generative Engine Optimizer (GEO) specialist. Analyze the provided content for its effectiveness in AI-powered search engines and LLM systems.

Evaluate the content based on these GEO criteria (score 1-10 each):

1. **AI Search Visibility**: How likely is this content to be surfaced by AI search engines?
2. **Query Intent Matching**: How well does the content match common user queries?
3. **Factual Accuracy & Authority**: How trustworthy and authoritative is the information?
4. **Conversational Readiness**: How suitable is the content for AI chat responses?
5. **Semantic Richness**: How well does the content use relevant semantic keywords?
6. **Context Completeness**: Does the content provide complete, self-contained answers?
7. **Citation Worthiness**: How likely are AI systems to cite this content?
8. **Multi-Query Coverage**: Does the content answer multiple related questions?

Also identify:
- Primary topics and entities
- Missing information gaps
- Optimization opportunities
- Specific enhancement recommendations

Format your response as JSON:

```json
{
    "geo_scores": {
        "ai_search_visibility": 7.5,
        "query_intent_matching": 8.0,
        "factual_accuracy": 9.0,
        "conversational_readiness": 6.5,
        "semantic_richness": 7.0,
        "context_completeness": 8.5,
        "citation_worthiness": 7.8,
        "multi_query_coverage": 6.0
    },
    "overall_geo_score": 7.5,
    "primary_topics": ["topic1", "topic2"],
    "entities": ["entity1", "entity2"],
    "missing_gaps": ["gap1", "gap2"],
    "optimization_opportunities": [
        {
            "type": "semantic_enhancement",
            "description": "Add more related terms",
            "priority": "high"
        }
    ],
    "recommendations": [
        "Specific actionable recommendation 1",
        "Specific actionable recommendation 2"
    ]
}
```"""

    # Quick scoring prompt for faster analysis: only four criteria and
    # a single top recommendation.
    self.quick_score_prompt = """Analyze this content for AI search optimization. Provide scores (1-10) for:

1. AI Search Visibility
2. Query Intent Matching
3. Conversational Readiness
4. Citation Worthiness

Respond in JSON format:
```json
{
    "scores": {
        "ai_search_visibility": 7.5,
        "query_intent_matching": 8.0,
        "conversational_readiness": 6.5,
        "citation_worthiness": 7.8
    },
    "overall_score": 7.5,
    "top_recommendation": "Most important improvement needed"
}
```"""

    # Competitive analysis prompt comparing two content pieces.
    # NOTE(review): this string mixes .format-style placeholders
    # ({content_a}, {content_b}) with unescaped braces in the JSON
    # example; str.format (or ChatPromptTemplate templating) would choke
    # on the single braces — confirm how this prompt is rendered.
    self.competitive_prompt = """Compare these content pieces for GEO performance. Identify which performs better for AI search and why.

Content A: {content_a}

Content B: {content_b}

Provide analysis in JSON:
```json
{
    "winner": "A" or "B",
    "score_comparison": {
        "content_a_score": 7.5,
        "content_b_score": 8.2
    },
    "key_differences": ["difference1", "difference2"],
    "improvement_suggestions": {
        "content_a": ["suggestion1"],
        "content_b": ["suggestion1"]
    }
}
```"""
118
def analyze_page_geo(self, content: str, title: str, detailed: bool = True) -> Dict[str, Any]:
    """
    Analyze a single page for GEO performance.

    Args:
        content (str): Page content to analyze
        title (str): Page title
        detailed (bool): Whether to run the full eight-metric analysis
            (True) or the quick four-metric pass (False)

    Returns:
        Dict: Parsed GEO analysis plus metadata, or a dict with an
        'error' key on failure.
    """
    try:
        # BUG FIX: ChatPromptTemplate interprets single braces as
        # template variables. The system prompts embed literal JSON
        # braces (and page content may contain braces too), so
        # invoke({}) would raise a missing-variable error. Escape all
        # braces before building the template.
        def esc(text: str) -> str:
            return text.replace('{', '{{').replace('}', '}}')

        # Choose prompt and content budget based on detail level.
        if detailed:
            system_prompt = self.geo_analysis_prompt
            user_message = f"Title: {title}\n\nContent: {content[:8000]}"  # Limit content length
        else:
            system_prompt = self.quick_score_prompt
            user_message = f"Title: {title}\n\nContent: {content[:4000]}"

        prompt_template = ChatPromptTemplate.from_messages([
            ("system", esc(system_prompt)),
            ("user", esc(user_message))
        ])

        # Run analysis
        chain = prompt_template | self.llm
        result = chain.invoke({})

        # Extract the text payload regardless of message/str result type.
        result_content = result.content if hasattr(result, 'content') else str(result)
        parsed_result = self._parse_llm_response(result_content)

        # Attach metadata about what was analyzed.
        parsed_result.update({
            'analyzed_title': title,
            'content_length': len(content),
            'word_count': len(content.split()),
            'analysis_type': 'detailed' if detailed else 'quick'
        })

        return parsed_result

    except Exception as e:
        return {'error': f"GEO analysis failed: {str(e)}"}
163
+
164
def analyze_multiple_pages(self, pages_data: List[Dict[str, Any]], detailed: bool = True) -> List[Dict[str, Any]]:
    """
    Run the single-page GEO analysis over a list of pages.

    Args:
        pages_data (List[Dict]): Page dicts with 'content', 'title',
            'url' and 'word_count' keys (all optional).
        detailed (bool): Whether to perform the detailed analysis.

    Returns:
        List[Dict]: One analysis result per page; a failed page yields
        a dict with an 'error' key instead.
    """
    results: List[Dict[str, Any]] = []

    for index, page in enumerate(pages_data):
        try:
            analysis = self.analyze_page_geo(
                page.get('content', ''),
                page.get('title', f'Page {index+1}'),
                detailed,
            )
            # Tag the result with where it came from.
            analysis.update({
                'page_url': page.get('url', ''),
                'page_index': index,
                'source_word_count': page.get('word_count', 0),
            })
            results.append(analysis)
        except Exception as exc:
            # Record the failure but keep processing remaining pages.
            results.append({
                'page_index': index,
                'page_url': page.get('url', ''),
                'error': f"Analysis failed: {str(exc)}",
            })

    return results
201
+
202
def compare_content_geo(self, content_a: str, content_b: str, titles: tuple = None) -> Dict[str, Any]:
    """
    Compare two pieces of content for GEO performance.

    Args:
        content_a (str): First content to compare
        content_b (str): Second content to compare
        titles (tuple): Optional (title_a, title_b) pair

    Returns:
        Dict: Comparison analysis results, or a dict with an 'error'
        key on failure.
    """
    try:
        title_a, title_b = titles if titles else ("Content A", "Content B")

        # BUG FIX: the original called str.format() on
        # competitive_prompt, which raised ValueError because the
        # prompt's literal JSON braces are parsed as format fields —
        # the method could never succeed. Substitute the two
        # placeholders with .replace(), which leaves other braces alone.
        # (An unused, dead prompt_template was also removed.)
        formatted_prompt = (
            self.competitive_prompt
            .replace('{content_a}', f"Title: {title_a}\nContent: {content_a[:4000]}")
            .replace('{content_b}', f"Title: {title_b}\nContent: {content_b[:4000]}")
        )

        # Escape the remaining literal braces so ChatPromptTemplate does
        # not treat them as template variables when the chain is invoked.
        escaped_prompt = formatted_prompt.replace('{', '{{').replace('}', '}}')

        chain = ChatPromptTemplate.from_messages([
            ("system", escaped_prompt),
            ("user", "Perform the comparison analysis.")
        ]) | self.llm

        result = chain.invoke({})
        result_content = result.content if hasattr(result, 'content') else str(result)

        return self._parse_llm_response(result_content)

    except Exception as e:
        return {'error': f"Comparison analysis failed: {str(e)}"}
240
+
241
def calculate_aggregate_scores(self, individual_results: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Aggregate per-page GEO analyses into site-level scores and insights.

    Args:
        individual_results (List[Dict]): Per-page analysis results.

    Returns:
        Dict: Averaged metric scores, best/worst metrics, de-duplicated
        recommendations/topics/entities, high-priority opportunities and
        a score distribution — or a dict with an 'error' key when no
        result is usable.
    """
    try:
        usable = [
            r for r in individual_results
            if 'geo_scores' in r and not r.get('error')
        ]
        if not usable:
            return {'error': 'No valid results to aggregate'}

        # Average each metric listed by the first result over every
        # result that actually reports that metric.
        avg_scores: Dict[str, float] = {}
        for metric in usable[0]['geo_scores']:
            values = [r['geo_scores'][metric] for r in usable if metric in r['geo_scores']]
            avg_scores[metric] = sum(values) / len(values) if values else 0

        overall_avg = sum(avg_scores.values()) / len(avg_scores) if avg_scores else 0

        # Pool the qualitative outputs of every page.
        recommendations: List[str] = []
        opportunities: List[Dict[str, Any]] = []
        topics: List[str] = []
        entities: List[str] = []
        for r in usable:
            recommendations.extend(r.get('recommendations', []))
            opportunities.extend(r.get('optimization_opportunities', []))
            topics.extend(r.get('primary_topics', []))
            entities.extend(r.get('entities', []))

        # Set-based de-duplication (order not guaranteed, matching the
        # original behavior).
        unique_recommendations = list(set(recommendations))
        unique_topics = list(set(topics))
        unique_entities = list(set(entities))

        # Best and worst performing metrics by average score.
        best_metric, best_value = max(avg_scores.items(), key=lambda kv: kv[1]) if avg_scores else ('none', 0)
        worst_metric, worst_value = min(avg_scores.items(), key=lambda kv: kv[1]) if avg_scores else ('none', 0)

        return {
            'aggregate_scores': avg_scores,
            'overall_score': overall_avg,
            'pages_analyzed': len(usable),
            'best_performing_metric': {
                'metric': best_metric,
                'score': best_value
            },
            'lowest_performing_metric': {
                'metric': worst_metric,
                'score': worst_value
            },
            'consolidated_recommendations': unique_recommendations[:10],
            'all_topics': unique_topics,
            'all_entities': unique_entities,
            'high_priority_opportunities': [
                opp for opp in opportunities
                if opp.get('priority') == 'high'
            ][:5],
            'score_distribution': self._calculate_score_distribution(avg_scores)
        }

    except Exception as e:
        return {'error': f"Aggregation failed: {str(e)}"}
312
+
313
def generate_geo_report(self, analysis_results: Dict[str, Any], website_url: str = None) -> Dict[str, Any]:
    """
    Assemble a comprehensive GEO report from aggregated results.

    Args:
        analysis_results (Dict): Output of calculate_aggregate_scores().
        website_url (str): Optional site URL recorded in the metadata.

    Returns:
        Dict: Structured report, or a dict with an 'error' key on failure.
    """
    try:
        metadata = {
            'generated_at': self._get_timestamp(),
            'website_url': website_url,
            'analysis_type': 'GEO Performance Report'
        }
        technical = {
            'pages_analyzed': analysis_results.get('pages_analyzed', 0),
            'overall_score': analysis_results.get('overall_score', 0),
            'score_distribution': analysis_results.get('score_distribution', {})
        }
        recommendations = analysis_results.get('consolidated_recommendations', [])

        return {
            'report_metadata': metadata,
            'executive_summary': self._generate_executive_summary(analysis_results),
            'detailed_scores': analysis_results.get('aggregate_scores', {}),
            'performance_insights': self._generate_performance_insights(analysis_results),
            'actionable_recommendations': self._prioritize_recommendations(recommendations),
            'optimization_roadmap': self._create_optimization_roadmap(analysis_results),
            'competitive_position': self._assess_competitive_position(analysis_results),
            'technical_details': technical
        }

    except Exception as e:
        return {'error': f"Report generation failed: {str(e)}"}
350
+
351
+ def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
352
+ """Parse LLM response and extract JSON content"""
353
+ try:
354
+ # Find JSON content in the response
355
+ json_start = response_text.find('{')
356
+ json_end = response_text.rfind('}') + 1
357
+
358
+ if json_start != -1 and json_end != -1:
359
+ json_str = response_text[json_start:json_end]
360
+ return json.loads(json_str)
361
+ else:
362
+ # If no JSON found, return the raw response
363
+ return {'raw_response': response_text, 'parsing_error': 'No JSON found'}
364
+
365
+ except json.JSONDecodeError as e:
366
+ return {'raw_response': response_text, 'parsing_error': f'JSON decode error: {str(e)}'}
367
+ except Exception as e:
368
+ return {'raw_response': response_text, 'parsing_error': f'Unexpected error: {str(e)}'}
369
+
370
+ def _calculate_score_distribution(self, scores: Dict[str, float]) -> Dict[str, Any]:
371
+ """Calculate distribution of scores for insights"""
372
+ if not scores:
373
+ return {}
374
+
375
+ score_values = list(scores.values())
376
+
377
+ return {
378
+ 'highest_score': max(score_values),
379
+ 'lowest_score': min(score_values),
380
+ 'average_score': sum(score_values) / len(score_values),
381
+ 'score_range': max(score_values) - min(score_values),
382
+ 'scores_above_7': len([s for s in score_values if s >= 7.0]),
383
+ 'scores_below_5': len([s for s in score_values if s < 5.0])
384
+ }
385
+
386
+ def _generate_executive_summary(self, analysis_results: Dict[str, Any]) -> str:
387
+ """Generate executive summary based on analysis results"""
388
+ overall_score = analysis_results.get('overall_score', 0)
389
+ pages_analyzed = analysis_results.get('pages_analyzed', 0)
390
+
391
+ if overall_score >= 8.0:
392
+ performance = "excellent"
393
+ elif overall_score >= 6.5:
394
+ performance = "good"
395
+ elif overall_score >= 5.0:
396
+ performance = "moderate"
397
+ else:
398
+ performance = "needs improvement"
399
+
400
+ return f"Analysis of {pages_analyzed} pages shows {performance} GEO performance with an overall score of {overall_score:.1f}/10. Key opportunities exist in {analysis_results.get('lowest_performing_metric', {}).get('metric', 'multiple areas')}."
401
+
402
+ def _generate_performance_insights(self, analysis_results: Dict[str, Any]) -> List[str]:
403
+ """Generate performance insights based on analysis"""
404
+ insights = []
405
+
406
+ best_metric = analysis_results.get('best_performing_metric', {})
407
+ worst_metric = analysis_results.get('lowest_performing_metric', {})
408
+
409
+ if best_metric.get('score', 0) >= 8.0:
410
+ insights.append(f"Strong performance in {best_metric.get('metric', 'unknown')} (score: {best_metric.get('score', 0):.1f})")
411
+
412
+ if worst_metric.get('score', 10) < 6.0:
413
+ insights.append(f"Significant improvement needed in {worst_metric.get('metric', 'unknown')} (score: {worst_metric.get('score', 0):.1f})")
414
+
415
+ score_dist = analysis_results.get('score_distribution', {})
416
+ if score_dist.get('score_range', 0) > 3.0:
417
+ insights.append("High variability in scores indicates inconsistent optimization across metrics")
418
+
419
+ return insights
420
+
421
+ def _prioritize_recommendations(self, recommendations: List[str]) -> List[Dict[str, Any]]:
422
+ """Prioritize recommendations based on impact potential"""
423
+ prioritized = []
424
+
425
+ # Simple prioritization based on keywords
426
+ high_impact_keywords = ['semantic', 'structure', 'authority', 'factual']
427
+ medium_impact_keywords = ['readability', 'clarity', 'format']
428
+
429
+ for i, rec in enumerate(recommendations):
430
+ priority = 'low'
431
+ if any(keyword in rec.lower() for keyword in high_impact_keywords):
432
+ priority = 'high'
433
+ elif any(keyword in rec.lower() for keyword in medium_impact_keywords):
434
+ priority = 'medium'
435
+
436
+ prioritized.append({
437
+ 'recommendation': rec,
438
+ 'priority': priority,
439
+ 'order': i + 1
440
+ })
441
+
442
+ # Sort by priority
443
+ priority_order = {'high': 1, 'medium': 2, 'low': 3}
444
+ prioritized.sort(key=lambda x: priority_order[x['priority']])
445
+
446
+ return prioritized
447
+
448
+ def _create_optimization_roadmap(self, analysis_results: Dict[str, Any]) -> Dict[str, List[str]]:
449
+ """Create a phased optimization roadmap"""
450
+ roadmap = {
451
+ 'immediate_actions': [],
452
+ 'short_term_goals': [],
453
+ 'long_term_strategy': []
454
+ }
455
+
456
+ overall_score = analysis_results.get('overall_score', 0)
457
+ worst_metric = analysis_results.get('lowest_performing_metric', {})
458
+
459
+ # Immediate actions based on worst performing metric
460
+ if worst_metric.get('score', 10) < 5.0:
461
+ roadmap['immediate_actions'].append(f"Address critical issues in {worst_metric.get('metric', 'low-scoring areas')}")
462
+
463
+ # Short-term goals
464
+ if overall_score < 7.0:
465
+ roadmap['short_term_goals'].append("Improve overall GEO score to above 7.0")
466
+ roadmap['short_term_goals'].append("Enhance content structure and semantic richness")
467
+
468
+ # Long-term strategy
469
+ roadmap['long_term_strategy'].append("Establish consistent GEO optimization process")
470
+ roadmap['long_term_strategy'].append("Monitor and track AI search performance")
471
+
472
+ return roadmap
473
+
474
+ def _assess_competitive_position(self, analysis_results: Dict[str, Any]) -> Dict[str, Any]:
475
+ """Assess competitive position based on scores"""
476
+ overall_score = analysis_results.get('overall_score', 0)
477
+
478
+ if overall_score >= 8.5:
479
+ position = "market_leader"
480
+ description = "Content is highly optimized for AI search engines"
481
+ elif overall_score >= 7.0:
482
+ position = "competitive"
483
+ description = "Content performs well but has room for improvement"
484
+ elif overall_score >= 5.5:
485
+ position = "average"
486
+ description = "Content meets basic standards but lacks optimization"
487
+ else:
488
+ position = "needs_work"
489
+ description = "Content requires significant optimization for AI search"
490
+
491
+ return {
492
+ 'position': position,
493
+ 'description': description,
494
+ 'score': overall_score,
495
+ 'percentile_estimate': min(overall_score * 10, 100) # Rough percentile estimate
496
+ }
497
+
498
+ def _get_timestamp(self) -> str:
499
+ """Get current timestamp"""
500
+ from datetime import datetime
501
+ return datetime.now().strftime('%Y-%m-%d %H:%M:%S')