MBilal-72 committed on
Commit
c76bc58
·
verified ·
1 Parent(s): 67e514b

Upload Utils and its files

Browse files
Files changed (5) hide show
  1. utils/chunker.py +1314 -0
  2. utils/export.py +1896 -0
  3. utils/optimizer.py +558 -0
  4. utils/parser.py +549 -0
  5. utils/scorer.py +501 -0
utils/chunker.py ADDED
@@ -0,0 +1,1314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Vector Chunking and RAG Module
3
+ Handles document chunking, vector embeddings, and RAG question-answering
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import numpy as np
9
+ from typing import Dict, Any, List, Optional, Tuple
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
11
+ from langchain.schema import Document
12
+ from langchain_community.vectorstores import FAISS, Chroma
13
+ from langchain.chains import RetrievalQA, ConversationalRetrievalChain
14
+ from langchain.memory import ConversationBufferMemory
15
+ from langchain.prompts import PromptTemplate
16
+ import tempfile
17
+ import shutil
18
+
19
+
20
class VectorChunker:
    """Main class for document chunking and vector operations."""

    def __init__(self, embeddings_model, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Set up the chunker with an embeddings backend and chunk parameters.

        Args:
            embeddings_model: LangChain-compatible embeddings object used by
                every vector-store and embedding operation in this class.
            chunk_size (int): Target chunk length in characters.
            chunk_overlap (int): Character overlap between consecutive chunks.
        """
        # Core configuration read by the splitters built below.
        self.embeddings = embeddings_model
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Cache for vector stores (not populated by the methods visible here).
        self.vector_stores = {}
        self.setup_text_splitters()
30
+ def setup_text_splitters(self):
31
+ """Initialize different text splitting strategies"""
32
+
33
+ # Default recursive splitter
34
+ self.recursive_splitter = RecursiveCharacterTextSplitter(
35
+ chunk_size=self.chunk_size,
36
+ chunk_overlap=self.chunk_overlap,
37
+ length_function=len,
38
+ separators=["\n\n", "\n", " ", ""]
39
+ )
40
+
41
+ # Character-based splitter
42
+ self.character_splitter = CharacterTextSplitter(
43
+ chunk_size=self.chunk_size,
44
+ chunk_overlap=self.chunk_overlap,
45
+ separator="\n\n"
46
+ )
47
+
48
+ # Semantic splitter for better context preservation
49
+ self.semantic_splitter = RecursiveCharacterTextSplitter(
50
+ chunk_size=800, # Smaller chunks for better semantic coherence
51
+ chunk_overlap=150,
52
+ length_function=len,
53
+ separators=["\n\n", "\n", ". ", " ", ""]
54
+ )
55
+
56
+ def chunk_documents(self, documents: List[Document], strategy: str = "recursive") -> List[Document]:
57
+ """
58
+ Chunk documents using specified strategy
59
+
60
+ Args:
61
+ documents (List[Document]): List of documents to chunk
62
+ strategy (str): Chunking strategy ("recursive", "character", "semantic")
63
+
64
+ Returns:
65
+ List[Document]: List of chunked documents
66
+ """
67
+ try:
68
+ # Choose splitter based on strategy
69
+ if strategy == "character":
70
+ splitter = self.character_splitter
71
+ elif strategy == "semantic":
72
+ splitter = self.semantic_splitter
73
+ else:
74
+ splitter = self.recursive_splitter
75
+
76
+ # Split documents
77
+ chunked_docs = []
78
+
79
+ for doc in documents:
80
+ chunks = splitter.split_documents([doc])
81
+
82
+ # Add chunk metadata
83
+ for i, chunk in enumerate(chunks):
84
+ chunk.metadata.update({
85
+ 'chunk_index': i,
86
+ 'total_chunks': len(chunks),
87
+ 'chunk_strategy': strategy,
88
+ 'original_source': doc.metadata.get('source', 'unknown'),
89
+ 'chunk_size': len(chunk.page_content),
90
+ 'chunk_word_count': len(chunk.page_content.split())
91
+ })
92
+
93
+ chunked_docs.extend(chunks)
94
+
95
+ return chunked_docs
96
+
97
+ except Exception as e:
98
+ raise Exception(f"Document chunking failed: {str(e)}")
99
+
100
+ def create_vector_store(self, documents: List[Document], store_type: str = "faiss",
101
+ persist_directory: Optional[str] = None) -> Any:
102
+ """
103
+ Create vector store from documents
104
+
105
+ Args:
106
+ documents (List[Document]): Documents to vectorize
107
+ store_type (str): Type of vector store ("faiss", "chroma")
108
+ persist_directory (str): Optional directory to persist the store
109
+
110
+ Returns:
111
+ Vector store instance
112
+ """
113
+ try:
114
+ if not documents:
115
+ raise ValueError("No documents provided for vector store creation")
116
+
117
+ if store_type.lower() == "chroma":
118
+ if persist_directory:
119
+ vector_store = Chroma.from_documents(
120
+ documents=documents,
121
+ embedding=self.embeddings,
122
+ persist_directory=persist_directory
123
+ )
124
+ vector_store.persist()
125
+ else:
126
+ vector_store = Chroma.from_documents(
127
+ documents=documents,
128
+ embedding=self.embeddings
129
+ )
130
+ else: # Default to FAISS
131
+ vector_store = FAISS.from_documents(
132
+ documents=documents,
133
+ embedding=self.embeddings
134
+ )
135
+
136
+ # Save FAISS index if persist directory provided
137
+ if persist_directory:
138
+ os.makedirs(persist_directory, exist_ok=True)
139
+ vector_store.save_local(persist_directory)
140
+
141
+ return vector_store
142
+
143
+ except Exception as e:
144
+ raise Exception(f"Vector store creation failed: {str(e)}")
145
+
146
+ def create_qa_chain(self, documents: List[Document], llm, chain_type: str = "stuff") -> RetrievalQA:
147
+ """
148
+ Create a Question-Answering chain from documents
149
+
150
+ Args:
151
+ documents (List[Document]): Documents for the knowledge base
152
+ llm: Language model for answering questions
153
+ chain_type (str): Type of QA chain ("stuff", "map_reduce", "refine")
154
+
155
+ Returns:
156
+ RetrievalQA: Configured QA chain
157
+ """
158
+ try:
159
+ # Chunk documents
160
+ chunked_docs = self.chunk_documents(documents, strategy="semantic")
161
+
162
+ # Create vector store
163
+ vector_store = self.create_vector_store(chunked_docs, store_type="faiss")
164
+
165
+ # Create retriever
166
+ retriever = vector_store.as_retriever(
167
+ search_type="similarity",
168
+ search_kwargs={"k": 4} # Retrieve top 4 most relevant chunks
169
+ )
170
+
171
+ # Custom prompt for GEO-focused QA
172
+ qa_prompt_template = """Use the following pieces of context to answer the question at the end.
173
+ If you don't know the answer, just say that you don't know, don't try to make up an answer.
174
+ Focus on providing clear, accurate, and complete answers that would be suitable for AI search engines.
175
+
176
+ Context:
177
+ {context}
178
+
179
+ Question: {question}
180
+
181
+ Answer:"""
182
+
183
+ qa_prompt = PromptTemplate(
184
+ template=qa_prompt_template,
185
+ input_variables=["context", "question"]
186
+ )
187
+
188
+ # Create QA chain
189
+ qa_chain = RetrievalQA.from_chain_type(
190
+ llm=llm,
191
+ chain_type=chain_type,
192
+ retriever=retriever,
193
+ return_source_documents=True,
194
+ chain_type_kwargs={"prompt": qa_prompt}
195
+ )
196
+
197
+ return qa_chain
198
+
199
+ except Exception as e:
200
+ raise Exception(f"QA chain creation failed: {str(e)}")
201
+
202
+ def create_conversational_chain(self, documents: List[Document], llm) -> ConversationalRetrievalChain:
203
+ """
204
+ Create a conversational retrieval chain with memory
205
+
206
+ Args:
207
+ documents (List[Document]): Documents for the knowledge base
208
+ llm: Language model for conversation
209
+
210
+ Returns:
211
+ ConversationalRetrievalChain: Configured conversational chain
212
+ """
213
+ try:
214
+ # Chunk documents
215
+ chunked_docs = self.chunk_documents(documents, strategy="semantic")
216
+
217
+ # Create vector store
218
+ vector_store = self.create_vector_store(chunked_docs, store_type="faiss")
219
+
220
+ # Create retriever
221
+ retriever = vector_store.as_retriever(
222
+ search_type="similarity",
223
+ search_kwargs={"k": 3}
224
+ )
225
+
226
+ # Create memory
227
+ memory = ConversationBufferMemory(
228
+ memory_key="chat_history",
229
+ return_messages=True,
230
+ output_key="answer"
231
+ )
232
+
233
+ # Custom prompt for conversational QA
234
+ condense_question_prompt = """Given the following conversation and a follow up question,
235
+ rephrase the follow up question to be a standalone question that can be understood without the chat history.
236
+
237
+ Chat History:
238
+ {chat_history}
239
+ Follow Up Input: {question}
240
+ Standalone question:"""
241
+
242
+ # Create conversational chain
243
+ conv_chain = ConversationalRetrievalChain.from_llm(
244
+ llm=llm,
245
+ retriever=retriever,
246
+ memory=memory,
247
+ return_source_documents=True,
248
+ condense_question_prompt=PromptTemplate.from_template(condense_question_prompt)
249
+ )
250
+
251
+ return conv_chain
252
+
253
+ except Exception as e:
254
+ raise Exception(f"Conversational chain creation failed: {str(e)}")
255
+
256
+ def semantic_search(self, query: str, documents: List[Document], top_k: int = 5) -> List[Dict[str, Any]]:
257
+ """
258
+ Perform semantic search on documents
259
+
260
+ Args:
261
+ query (str): Search query
262
+ documents (List[Document]): Documents to search
263
+ top_k (int): Number of top results to return
264
+
265
+ Returns:
266
+ List[Dict]: Search results with scores
267
+ """
268
+ try:
269
+ # Chunk documents
270
+ chunked_docs = self.chunk_documents(documents, strategy="semantic")
271
+
272
+ # Create vector store
273
+ vector_store = self.create_vector_store(chunked_docs, store_type="faiss")
274
+
275
+ # Perform similarity search with scores
276
+ results = vector_store.similarity_search_with_score(query, k=top_k)
277
+
278
+ # Format results
279
+ formatted_results = []
280
+ for doc, score in results:
281
+ result = {
282
+ 'content': doc.page_content,
283
+ 'metadata': doc.metadata,
284
+ 'similarity_score': float(score),
285
+ 'relevance_rank': len(formatted_results) + 1
286
+ }
287
+ formatted_results.append(result)
288
+
289
+ return formatted_results
290
+
291
+ except Exception as e:
292
+ raise Exception(f"Semantic search failed: {str(e)}")
293
+
294
+ def analyze_document_similarity(self, documents: List[Document]) -> Dict[str, Any]:
295
+ """
296
+ Analyze similarity between documents
297
+
298
+ Args:
299
+ documents (List[Document]): Documents to analyze
300
+
301
+ Returns:
302
+ Dict: Similarity analysis results
303
+ """
304
+ try:
305
+ if len(documents) < 2:
306
+ return {'error': 'Need at least 2 documents for similarity analysis'}
307
+
308
+ # Chunk documents
309
+ chunked_docs = self.chunk_documents(documents, strategy="semantic")
310
+
311
+ # Create embeddings for each document
312
+ doc_embeddings = []
313
+ doc_metadata = []
314
+
315
+ for doc in chunked_docs:
316
+ # Get embedding for the document
317
+ embedding = self.embeddings.embed_query(doc.page_content)
318
+ doc_embeddings.append(embedding)
319
+ doc_metadata.append({
320
+ 'content_preview': doc.page_content[:200] + "...",
321
+ 'metadata': doc.metadata,
322
+ 'length': len(doc.page_content)
323
+ })
324
+
325
+ # Calculate pairwise similarities
326
+ similarities = []
327
+ embeddings_array = np.array(doc_embeddings)
328
+
329
+ for i in range(len(embeddings_array)):
330
+ for j in range(i + 1, len(embeddings_array)):
331
+ # Calculate cosine similarity
332
+ similarity = np.dot(embeddings_array[i], embeddings_array[j]) / (
333
+ np.linalg.norm(embeddings_array[i]) * np.linalg.norm(embeddings_array[j])
334
+ )
335
+
336
+ similarities.append({
337
+ 'doc_1_index': i,
338
+ 'doc_2_index': j,
339
+ 'similarity_score': float(similarity),
340
+ 'doc_1_preview': doc_metadata[i]['content_preview'],
341
+ 'doc_2_preview': doc_metadata[j]['content_preview']
342
+ })
343
+
344
+ # Sort by similarity score
345
+ similarities.sort(key=lambda x: x['similarity_score'], reverse=True)
346
+
347
+ # Calculate statistics
348
+ similarity_scores = [s['similarity_score'] for s in similarities]
349
+
350
+ return {
351
+ 'total_comparisons': len(similarities),
352
+ 'average_similarity': np.mean(similarity_scores),
353
+ 'max_similarity': max(similarity_scores),
354
+ 'min_similarity': min(similarity_scores),
355
+ 'similarity_distribution': {
356
+ 'high_similarity': len([s for s in similarity_scores if s > 0.8]),
357
+ 'medium_similarity': len([s for s in similarity_scores if 0.5 < s <= 0.8]),
358
+ 'low_similarity': len([s for s in similarity_scores if s <= 0.5])
359
+ },
360
+ 'top_similar_pairs': similarities[:5],
361
+ 'most_dissimilar_pairs': similarities[-3:]
362
+ }
363
+
364
+ except Exception as e:
365
+ return {'error': f"Similarity analysis failed: {str(e)}"}
366
+
367
+ def extract_key_passages(self, documents: List[Document], queries: List[str],
368
+ passages_per_query: int = 3) -> Dict[str, List[Dict[str, Any]]]:
369
+ """
370
+ Extract key passages from documents based on multiple queries
371
+
372
+ Args:
373
+ documents (List[Document]): Documents to search
374
+ queries (List[str]): List of queries to search for
375
+ passages_per_query (int): Number of passages to extract per query
376
+
377
+ Returns:
378
+ Dict: Key passages organized by query
379
+ """
380
+ try:
381
+ # Chunk documents
382
+ chunked_docs = self.chunk_documents(documents, strategy="semantic")
383
+
384
+ # Create vector store
385
+ vector_store = self.create_vector_store(chunked_docs, store_type="faiss")
386
+
387
+ key_passages = {}
388
+
389
+ for query in queries:
390
+ # Search for relevant passages
391
+ results = vector_store.similarity_search_with_score(query, k=passages_per_query)
392
+
393
+ passages = []
394
+ for doc, score in results:
395
+ passage = {
396
+ 'content': doc.page_content,
397
+ 'relevance_score': float(score),
398
+ 'metadata': doc.metadata,
399
+ 'word_count': len(doc.page_content.split()),
400
+ 'query_match': query
401
+ }
402
+ passages.append(passage)
403
+
404
+ key_passages[query] = passages
405
+
406
+ return key_passages
407
+
408
+ except Exception as e:
409
+ return {'error': f"Key passage extraction failed: {str(e)}"}
410
+
411
+ def optimize_chunking_strategy(self, documents: List[Document],
412
+ test_queries: List[str]) -> Dict[str, Any]:
413
+ """
414
+ Test different chunking strategies and recommend the best one
415
+
416
+ Args:
417
+ documents (List[Document]): Documents to test
418
+ test_queries (List[str]): Queries to test retrieval performance
419
+
420
+ Returns:
421
+ Dict: Optimization results and recommendations
422
+ """
423
+ try:
424
+ strategies = ["recursive", "character", "semantic"]
425
+ strategy_results = {}
426
+
427
+ for strategy in strategies:
428
+ try:
429
+ # Test this strategy
430
+ chunked_docs = self.chunk_documents(documents, strategy=strategy)
431
+ vector_store = self.create_vector_store(chunked_docs, store_type="faiss")
432
+
433
+ # Test retrieval performance
434
+ retrieval_scores = []
435
+
436
+ for query in test_queries:
437
+ results = vector_store.similarity_search_with_score(query, k=3)
438
+
439
+ # Calculate average relevance score
440
+ if results:
441
+ avg_score = sum(score for _, score in results) / len(results)
442
+ retrieval_scores.append(float(avg_score))
443
+
444
+ # Calculate strategy metrics
445
+ avg_retrieval_score = np.mean(retrieval_scores) if retrieval_scores else 0
446
+ total_chunks = len(chunked_docs)
447
+ avg_chunk_size = np.mean([len(doc.page_content) for doc in chunked_docs])
448
+
449
+ strategy_results[strategy] = {
450
+ 'average_retrieval_score': avg_retrieval_score,
451
+ 'total_chunks': total_chunks,
452
+ 'average_chunk_size': avg_chunk_size,
453
+ 'retrieval_scores': retrieval_scores,
454
+ 'chunk_size_distribution': {
455
+ 'min': min(len(doc.page_content) for doc in chunked_docs),
456
+ 'max': max(len(doc.page_content) for doc in chunked_docs),
457
+ 'std': float(np.std([len(doc.page_content) for doc in chunked_docs]))
458
+ }
459
+ }
460
+
461
+ except Exception as e:
462
+ strategy_results[strategy] = {'error': f"Strategy test failed: {str(e)}"}
463
+
464
+ # Determine best strategy
465
+ valid_strategies = {k: v for k, v in strategy_results.items() if 'error' not in v}
466
+
467
+ if valid_strategies:
468
+ best_strategy = max(valid_strategies.keys(),
469
+ key=lambda k: valid_strategies[k]['average_retrieval_score'])
470
+
471
+ recommendation = {
472
+ 'recommended_strategy': best_strategy,
473
+ 'reason': f"Best average retrieval score: {valid_strategies[best_strategy]['average_retrieval_score']:.4f}",
474
+ 'all_results': strategy_results,
475
+ 'performance_summary': {
476
+ strategy: result.get('average_retrieval_score', 0)
477
+ for strategy, result in valid_strategies.items()
478
+ }
479
+ }
480
+ else:
481
+ recommendation = {
482
+ 'recommended_strategy': 'recursive', # Default fallback
483
+ 'reason': 'All strategies failed, using default',
484
+ 'all_results': strategy_results
485
+ }
486
+
487
+ return recommendation
488
+
489
+ except Exception as e:
490
+ return {'error': f"Chunking optimization failed: {str(e)}"}
491
+
492
+ def create_document_summary(self, documents: List[Document], llm,
493
+ summary_type: str = "extractive") -> Dict[str, Any]:
494
+ """
495
+ Create document summaries using the chunked content
496
+
497
+ Args:
498
+ documents (List[Document]): Documents to summarize
499
+ llm: Language model for summarization
500
+ summary_type (str): Type of summary ("extractive", "abstractive")
501
+
502
+ Returns:
503
+ Dict: Summary results
504
+ """
505
+ try:
506
+ # Chunk documents for better processing
507
+ chunked_docs = self.chunk_documents(documents, strategy="semantic")
508
+
509
+ if summary_type == "extractive":
510
+ # Extract key sentences/chunks
511
+ return self._create_extractive_summary(chunked_docs)
512
+ else:
513
+ # Generate abstractive summary using LLM
514
+ return self._create_abstractive_summary(chunked_docs, llm)
515
+
516
+ except Exception as e:
517
+ return {'error': f"Document summarization failed: {str(e)}"}
518
+
519
+ def _create_extractive_summary(self, chunked_docs: List[Document]) -> Dict[str, Any]:
520
+ """Create extractive summary by selecting key chunks"""
521
+ try:
522
+ # Simple extractive approach: select chunks with highest semantic density
523
+ chunk_scores = []
524
+
525
+ for doc in chunked_docs:
526
+ content = doc.page_content
527
+ # Simple scoring based on content characteristics
528
+ word_count = len(content.split())
529
+ sentence_count = len([s for s in content.split('.') if s.strip()])
530
+
531
+ # Score based on information density
532
+ density_score = word_count / max(sentence_count, 1)
533
+
534
+ # Bonus for chunks with questions, definitions, or lists
535
+ structure_bonus = 0
536
+ if '?' in content:
537
+ structure_bonus += 1
538
+ if any(word in content.lower() for word in ['define', 'definition', 'means', 'refers to']):
539
+ structure_bonus += 2
540
+ if content.count('\n•') > 0 or content.count('1.') > 0:
541
+ structure_bonus += 1
542
+
543
+ total_score = density_score + structure_bonus
544
+ chunk_scores.append((doc, total_score))
545
+
546
+ # Sort by score and select top chunks for summary
547
+ chunk_scores.sort(key=lambda x: x[1], reverse=True)
548
+ top_chunks = chunk_scores[:min(5, len(chunk_scores))]
549
+
550
+ summary_content = []
551
+ for doc, score in top_chunks:
552
+ summary_content.append({
553
+ 'content': doc.page_content,
554
+ 'score': score,
555
+ 'metadata': doc.metadata
556
+ })
557
+
558
+ return {
559
+ 'summary_type': 'extractive',
560
+ 'key_chunks': summary_content,
561
+ 'total_chunks_analyzed': len(chunked_docs),
562
+ 'chunks_selected': len(top_chunks)
563
+ }
564
+
565
+ except Exception as e:
566
+ return {'error': f"Extractive summary failed: {str(e)}"}
567
+
568
+ def _create_abstractive_summary(self, chunked_docs: List[Document], llm) -> Dict[str, Any]:
569
+ """Create abstractive summary using language model"""
570
+ try:
571
+ # Combine content from top chunks
572
+ combined_content = "\n\n".join([doc.page_content for doc in chunked_docs[:10]])
573
+
574
+ summary_prompt = f"""Please provide a comprehensive summary of the following content.
575
+ Focus on the main topics, key insights, and important details that would be valuable for AI search engines.
576
+
577
+ Content:
578
+ {combined_content[:5000]}
579
+
580
+ Summary:"""
581
+
582
+ from langchain.prompts import ChatPromptTemplate
583
+
584
+ prompt_template = ChatPromptTemplate.from_messages([
585
+ ("system", "You are a professional content summarizer. Create clear, informative summaries."),
586
+ ("user", summary_prompt)
587
+ ])
588
+
589
+ chain = prompt_template | llm
590
+ result = chain.invoke({})
591
+
592
+ summary_text = result.content if hasattr(result, 'content') else str(result)
593
+
594
+ return {
595
+ 'summary_type': 'abstractive',
596
+ 'summary': summary_text,
597
+ 'source_chunks': len(chunked_docs),
598
+ 'content_length_processed': len(combined_content)
599
+ }
600
+
601
+ except Exception as e:
602
+ return {'error': f"Abstractive summary failed: {str(e)}"}
603
+
604
+ def save_vector_store(self, vector_store, directory_path: str, store_type: str = "faiss") -> bool:
605
+ """
606
+ Save vector store to disk
607
+
608
+ Args:
609
+ vector_store: Vector store instance to save
610
+ directory_path (str): Directory to save the store
611
+ store_type (str): Type of vector store
612
+
613
+ Returns:
614
+ bool: Success status
615
+ """
616
+ try:
617
+ os.makedirs(directory_path, exist_ok=True)
618
+
619
+ if store_type.lower() == "faiss":
620
+ vector_store.save_local(directory_path)
621
+ elif store_type.lower() == "chroma":
622
+ # Chroma stores are typically persisted during creation
623
+ pass
624
+
625
+ return True
626
+
627
+ except Exception as e:
628
+ print(f"Failed to save vector store: {str(e)}")
629
+ return False
630
+
631
+ def load_vector_store(self, directory_path: str, store_type: str = "faiss"):
632
+ """
633
+ Load vector store from disk
634
+
635
+ Args:
636
+ directory_path (str): Directory containing the saved store
637
+ store_type (str): Type of vector store
638
+
639
+ Returns:
640
+ Vector store instance or None if failed
641
+ """
642
+ try:
643
+ if not os.path.exists(directory_path):
644
+ return None
645
+
646
+ if store_type.lower() == "faiss":
647
+ vector_store = FAISS.load_local(
648
+ directory_path,
649
+ self.embeddings,
650
+ allow_dangerous_deserialization=True
651
+ )
652
+ return vector_store
653
+ elif store_type.lower() == "chroma":
654
+ vector_store = Chroma(
655
+ persist_directory=directory_path,
656
+ embedding_function=self.embeddings
657
+ )
658
+ return vector_store
659
+
660
+ return None
661
+
662
+ except Exception as e:
663
+ print(f"Failed to load vector store: {str(e)}")
664
+ return None
665
+
666
+ def get_chunking_stats(self, documents: List[Document], strategy: str = "recursive") -> Dict[str, Any]:
667
+ """
668
+ Get detailed statistics about document chunking
669
+
670
+ Args:
671
+ documents (List[Document]): Documents to analyze
672
+ strategy (str): Chunking strategy to use
673
+
674
+ Returns:
675
+ Dict: Detailed chunking statistics
676
+ """
677
+ try:
678
+ # Chunk documents
679
+ chunked_docs = self.chunk_documents(documents, strategy=strategy)
680
+
681
+ # Calculate statistics
682
+ chunk_sizes = [len(doc.page_content) for doc in chunked_docs]
683
+ word_counts = [len(doc.page_content.split()) for doc in chunked_docs]
684
+
685
+ stats = {
686
+ 'strategy_used': strategy,
687
+ 'original_documents': len(documents),
688
+ 'total_chunks': len(chunked_docs),
689
+ 'chunk_size_stats': {
690
+ 'min': min(chunk_sizes) if chunk_sizes else 0,
691
+ 'max': max(chunk_sizes) if chunk_sizes else 0,
692
+ 'mean': np.mean(chunk_sizes) if chunk_sizes else 0,
693
+ 'median': np.median(chunk_sizes) if chunk_sizes else 0,
694
+ 'std': np.std(chunk_sizes) if chunk_sizes else 0
695
+ },
696
+ 'word_count_stats': {
697
+ 'min': min(word_counts) if word_counts else 0,
698
+ 'max': max(word_counts) if word_counts else 0,
699
+ 'mean': np.mean(word_counts) if word_counts else 0,
700
+ 'median': np.median(word_counts) if word_counts else 0,
701
+ 'std': np.std(word_counts) if word_counts else 0
702
+ },
703
+ 'chunk_distribution': {
704
+ 'very_small': len([s for s in chunk_sizes if s < 200]),
705
+ 'small': len([s for s in chunk_sizes if 200 <= s < 500]),
706
+ 'medium': len([s for s in chunk_sizes if 500 <= s < 1000]),
707
+ 'large': len([s for s in chunk_sizes if 1000 <= s < 2000]),
708
+ 'very_large': len([s for s in chunk_sizes if s >= 2000])
709
+ },
710
+ 'overlap_efficiency': self._calculate_overlap_efficiency(chunked_docs),
711
+ 'content_coverage': self._calculate_content_coverage(documents, chunked_docs)
712
+ }
713
+
714
+ return stats
715
+
716
+ except Exception as e:
717
+ return {'error': f"Chunking statistics failed: {str(e)}"}
718
+
719
+ def _calculate_overlap_efficiency(self, chunked_docs: List[Document]) -> float:
720
+ """Calculate efficiency of chunk overlaps"""
721
+ try:
722
+ if len(chunked_docs) < 2:
723
+ return 1.0
724
+
725
+ total_content_length = sum(len(doc.page_content) for doc in chunked_docs)
726
+ unique_content = set()
727
+
728
+ # Rough estimate of content uniqueness
729
+ for doc in chunked_docs:
730
+ words = doc.page_content.split()
731
+ for i in range(0, len(words), 10): # Sample every 10th word
732
+ unique_content.add(' '.join(words[i:i+10]))
733
+
734
+ # Efficiency as ratio of unique content to total content
735
+ efficiency = len(unique_content) * 10 / total_content_length if total_content_length > 0 else 0
736
+ return min(efficiency, 1.0)
737
+
738
+ except Exception:
739
+ return 0.5 # Default neutral efficiency
740
+
741
+ def _calculate_content_coverage(self, original_docs: List[Document],
742
+ chunked_docs: List[Document]) -> float:
743
+ """Calculate how well chunks cover original content"""
744
+ try:
745
+ original_content = ' '.join([doc.page_content for doc in original_docs])
746
+ chunked_content = ' '.join([doc.page_content for doc in chunked_docs])
747
+
748
+ # Simple coverage metric based on length
749
+ coverage = len(chunked_content) / len(original_content) if original_content else 0
750
+ return min(coverage, 1.0)
751
+
752
+ except Exception:
753
+ return 0.0
754
+
755
+
756
class ChunkingOptimizer:
    """Helper class for optimizing chunking parameters."""

    def __init__(self, embeddings_model):
        # Embeddings backend handed to each trial VectorChunker.
        self.embeddings = embeddings_model

    def optimize_chunk_size(self, documents: List[Document], test_queries: List[str],
                            size_range: Tuple[int, int] = (200, 2000),
                            step_size: int = 200) -> Dict[str, Any]:
        """
        Find the chunk size giving the best retrieval for the given queries.

        Each candidate size gets its own VectorChunker + FAISS store; the test
        queries are run against it and the mean score per size recorded.

        NOTE: FAISS similarity_search_with_score returns L2 distances, where
        LOWER means more relevant — so the optimal size is the one with the
        LOWEST average score. (The original code picked the highest, i.e. the
        worst-performing size.)

        Args:
            documents (List[Document]): Documents to test.
            test_queries (List[str]): Queries for testing retrieval.
            size_range (Tuple[int, int]): Inclusive range of sizes to test.
            step_size (int): Step between tested sizes.

        Returns:
            Dict: Optimal size, per-size results and a trend summary;
            {'error': ...} when nothing could be tested.
        """
        try:
            results = {}
            min_size, max_size = size_range

            for chunk_size in range(min_size, max_size + 1, step_size):
                chunker = VectorChunker(self.embeddings, chunk_size=chunk_size)

                try:
                    chunked_docs = chunker.chunk_documents(documents)
                    vector_store = chunker.create_vector_store(chunked_docs)

                    retrieval_scores = []
                    for query in test_queries:
                        search_results = vector_store.similarity_search_with_score(query, k=3)
                        if search_results:
                            avg_score = sum(score for _, score in search_results) / len(search_results)
                            retrieval_scores.append(float(avg_score))

                    results[chunk_size] = {
                        'average_retrieval_score': np.mean(retrieval_scores) if retrieval_scores else 0,
                        'total_chunks': len(chunked_docs),
                        'retrieval_scores': retrieval_scores
                    }

                except Exception as e:
                    results[chunk_size] = {'error': str(e)}

            valid_results = {k: v for k, v in results.items() if 'error' not in v}

            if not valid_results:
                return {
                    'error': 'No valid chunk sizes could be tested',
                    'all_results': results
                }

            # Lowest average distance == best retrieval (see NOTE above).
            optimal_size = min(valid_results.keys(),
                               key=lambda k: valid_results[k]['average_retrieval_score'])

            return {
                'optimal_chunk_size': optimal_size,
                'optimal_performance': valid_results[optimal_size]['average_retrieval_score'],
                'all_results': results,
                'performance_trend': self._analyze_performance_trend(valid_results),
                'recommendation': f"Use chunk size {optimal_size} for best retrieval performance"
            }

        except Exception as e:
            return {'error': f"Chunk size optimization failed: {str(e)}"}

    def _analyze_performance_trend(self, results: Dict[int, Dict[str, Any]]) -> Dict[str, Any]:
        """Summarize how the average score moves across chunk sizes.

        Scores are FAISS L2 distances (lower is better), so the "peak"
        reported here is the minimum score and the size that produced it
        (aligned with the min-based selection in optimize_chunk_size).
        """
        try:
            sizes = sorted(results.keys())
            performances = [results[size]['average_retrieval_score'] for size in sizes]

            if len(performances) < 2:
                return {'error': 'Insufficient data for trend analysis'}

            trend_direction = "increasing" if performances[-1] > performances[0] else "decreasing"
            peak_performance = min(performances)  # lowest distance = best
            peak_size = sizes[performances.index(peak_performance)]
            spread = max(performances) - min(performances)

            return {
                'trend_direction': trend_direction,
                'peak_performance': peak_performance,
                'peak_size': peak_size,
                'performance_range': spread,
                'stable_performance': spread < 0.1
            }

        except Exception:
            return {'error': 'Trend analysis failed'}
855
+
856
+
857
class RAGPipeline:
    """Complete RAG pipeline for document question-answering.

    Each pipeline is identified by a caller-supplied ``pipeline_id`` and
    bundles a vector store plus a retrieval QA chain built from the same
    documents. Multiple independent pipelines can coexist on one instance.
    """

    def __init__(self, embeddings_model, llm):
        # Models shared by every pipeline this instance creates.
        self.embeddings = embeddings_model
        self.llm = llm
        self.chunker = VectorChunker(embeddings_model)
        # Per-pipeline components, keyed by pipeline_id.
        self.vector_stores = {}
        self.qa_chains = {}

    def create_pipeline(self, documents: List["Document"], pipeline_id: str,
                        chunking_strategy: str = "semantic") -> Dict[str, Any]:
        """
        Create a complete RAG pipeline for documents

        Args:
            documents (List[Document]): Documents to process
            pipeline_id (str): Unique identifier for this pipeline
            chunking_strategy (str): Strategy for document chunking

        Returns:
            Dict: Pipeline creation results, or a dict with an 'error' key
        """
        try:
            # Step 1: Chunk documents
            chunked_docs = self.chunker.chunk_documents(documents, strategy=chunking_strategy)

            # Step 2: Create vector store from the chunks
            vector_store = self.chunker.create_vector_store(chunked_docs, store_type="faiss")

            # Step 3: Create QA chain
            # NOTE(review): the chain is built from the raw documents, not the
            # chunks computed above; presumably create_qa_chain chunks
            # internally — confirm against VectorChunker.
            qa_chain = self.chunker.create_qa_chain(documents, self.llm)

            # Store pipeline components under the caller's id
            self.vector_stores[pipeline_id] = vector_store
            self.qa_chains[pipeline_id] = qa_chain

            # Pipeline statistics
            stats = {
                'pipeline_id': pipeline_id,
                'documents_processed': len(documents),
                'chunks_created': len(chunked_docs),
                'chunking_strategy': chunking_strategy,
                'vector_store_type': 'faiss',
                'embedding_model': str(self.embeddings),
                'created_at': self._get_timestamp()
            }

            return {
                'success': True,
                'pipeline_stats': stats,
                'chunking_info': self.chunker.get_chunking_stats(documents, chunking_strategy)
            }

        except Exception as e:
            return {'error': f"Pipeline creation failed: {str(e)}"}

    def query_pipeline(self, pipeline_id: str, query: str,
                       return_sources: bool = True) -> Dict[str, Any]:
        """
        Query a created RAG pipeline

        Args:
            pipeline_id (str): ID of the pipeline to query
            query (str): Question to ask
            return_sources (bool): Whether to return source documents

        Returns:
            Dict: Query results with answer and (optionally) sources
        """
        try:
            if pipeline_id not in self.qa_chains:
                return {'error': f"Pipeline '{pipeline_id}' not found"}

            qa_chain = self.qa_chains[pipeline_id]

            # Execute query (legacy LangChain chain __call__ protocol)
            result = qa_chain({"query": query})

            # Format response
            response = {
                'query': query,
                'answer': result.get('result', 'No answer generated'),
                'pipeline_id': pipeline_id,
                'query_timestamp': self._get_timestamp()
            }

            # Add source documents if requested
            if return_sources and 'source_documents' in result:
                sources = []
                for i, doc in enumerate(result['source_documents']):
                    source = {
                        'source_index': i,
                        'content': doc.page_content,
                        'metadata': doc.metadata,
                        # Retriever returns documents in relevance order
                        'relevance_rank': i + 1
                    }
                    sources.append(source)

                response['sources'] = sources
                response['num_sources'] = len(sources)

            return response

        except Exception as e:
            return {'error': f"Pipeline query failed: {str(e)}"}

    def batch_query_pipeline(self, pipeline_id: str, queries: List[str]) -> List[Dict[str, Any]]:
        """
        Execute multiple queries on a pipeline

        Args:
            pipeline_id (str): ID of the pipeline to query
            queries (List[str]): List of questions to ask

        Returns:
            List[Dict]: One result dict per query, each tagged with its
            'batch_index'; failures become {'batch_index', 'query', 'error'}.
        """
        results = []

        for i, query in enumerate(queries):
            try:
                # Sources are suppressed in batch mode to keep results compact
                result = self.query_pipeline(pipeline_id, query, return_sources=False)
                result['batch_index'] = i
                results.append(result)

            except Exception as e:
                results.append({
                    'batch_index': i,
                    'query': query,
                    'error': f"Batch query failed: {str(e)}"
                })

        return results

    def evaluate_pipeline(self, pipeline_id: str, test_queries: List[str],
                          expected_answers: List[str] = None) -> Dict[str, Any]:
        """
        Evaluate pipeline performance on test queries

        Args:
            pipeline_id (str): ID of the pipeline to evaluate
            test_queries (List[str]): Test questions
            expected_answers (List[str]): Optional expected answers for comparison

        Returns:
            Dict: Aggregate and per-query evaluation metrics
        """
        try:
            import time  # hoisted: was re-imported on every loop iteration

            if pipeline_id not in self.qa_chains:
                return {'error': f"Pipeline '{pipeline_id}' not found"}

            evaluation_results = []
            response_times = []

            for i, query in enumerate(test_queries):
                start_time = time.time()

                # Execute query with sources so we can count them
                result = self.query_pipeline(pipeline_id, query, return_sources=True)

                response_time = time.time() - start_time
                response_times.append(response_time)

                # Per-query evaluation record
                eval_result = {
                    'query_index': i,
                    'query': query,
                    'answer_generated': not result.get('error'),
                    'response_time': response_time,
                    'answer_length': len(result.get('answer', '')),
                    'sources_returned': result.get('num_sources', 0)
                }

                # If expected answer provided, calculate similarity
                if expected_answers and i < len(expected_answers):
                    expected = expected_answers[i]
                    generated = result.get('answer', '')

                    # Simple word-overlap (Jaccard) similarity metric
                    similarity = self._calculate_answer_similarity(expected, generated)
                    eval_result['answer_similarity'] = similarity
                    eval_result['expected_answer'] = expected

                evaluation_results.append(eval_result)

            # Calculate aggregate metrics
            successful_queries = len([r for r in evaluation_results if r['answer_generated']])
            avg_response_time = np.mean(response_times) if response_times else 0

            if expected_answers:
                similarities = [r.get('answer_similarity', 0) for r in evaluation_results
                                if 'answer_similarity' in r]
                avg_similarity = np.mean(similarities) if similarities else 0
            else:
                avg_similarity = None

            return {
                'pipeline_id': pipeline_id,
                'total_queries': len(test_queries),
                'successful_queries': successful_queries,
                'success_rate': successful_queries / len(test_queries) if test_queries else 0,
                'average_response_time': avg_response_time,
                'average_answer_similarity': avg_similarity,
                'detailed_results': evaluation_results,
                'evaluation_timestamp': self._get_timestamp()
            }

        except Exception as e:
            return {'error': f"Pipeline evaluation failed: {str(e)}"}

    def _calculate_answer_similarity(self, expected: str, generated: str) -> float:
        """Jaccard similarity on lowercase word sets; 1.0 when both empty."""
        try:
            expected_words = set(expected.lower().split())
            generated_words = set(generated.lower().split())

            if not expected_words and not generated_words:
                return 1.0

            intersection = expected_words.intersection(generated_words)
            union = expected_words.union(generated_words)

            return len(intersection) / len(union) if union else 0.0

        except Exception:
            return 0.0

    def get_pipeline_info(self, pipeline_id: str) -> Dict[str, Any]:
        """Get information about a specific pipeline (components, vector count)."""
        try:
            if pipeline_id not in self.qa_chains:
                return {'error': f"Pipeline '{pipeline_id}' not found"}

            # Get vector store info; FAISS exposes the count via index.ntotal
            vector_store = self.vector_stores.get(pipeline_id)
            if vector_store:
                try:
                    total_vectors = vector_store.index.ntotal if hasattr(vector_store, 'index') else 'unknown'
                except Exception:  # was a bare except: — don't swallow SystemExit/KeyboardInterrupt
                    total_vectors = 'unknown'
            else:
                total_vectors = 'unknown'

            return {
                'pipeline_id': pipeline_id,
                'has_qa_chain': pipeline_id in self.qa_chains,
                'has_vector_store': pipeline_id in self.vector_stores,
                'total_vectors': total_vectors,
                'embedding_model': str(self.embeddings),
                'llm_model': str(self.llm)
            }

        except Exception as e:
            return {'error': f"Failed to get pipeline info: {str(e)}"}

    def list_pipelines(self) -> Dict[str, Any]:
        """List all created pipelines"""
        return {
            'total_pipelines': len(self.qa_chains),
            'pipeline_ids': list(self.qa_chains.keys()),
            'vector_stores': list(self.vector_stores.keys())
        }

    def delete_pipeline(self, pipeline_id: str) -> Dict[str, Any]:
        """Delete a pipeline and free resources"""
        try:
            deleted_components = []

            if pipeline_id in self.qa_chains:
                del self.qa_chains[pipeline_id]
                deleted_components.append('qa_chain')

            if pipeline_id in self.vector_stores:
                del self.vector_stores[pipeline_id]
                deleted_components.append('vector_store')

            if deleted_components:
                return {
                    'success': True,
                    'pipeline_id': pipeline_id,
                    'deleted_components': deleted_components
                }
            else:
                return {'error': f"Pipeline '{pipeline_id}' not found"}

        except Exception as e:
            return {'error': f"Pipeline deletion failed: {str(e)}"}

    def export_pipeline_config(self, pipeline_id: str) -> Dict[str, Any]:
        """Export pipeline configuration for recreation"""
        try:
            if pipeline_id not in self.qa_chains:
                return {'error': f"Pipeline '{pipeline_id}' not found"}

            config = {
                'pipeline_id': pipeline_id,
                'embedding_model_name': getattr(self.embeddings, 'model_name', 'unknown'),
                'llm_model_name': getattr(self.llm, 'model_name', 'unknown'),
                'chunker_config': {
                    'chunk_size': self.chunker.chunk_size,
                    'chunk_overlap': self.chunker.chunk_overlap
                },
                'export_timestamp': self._get_timestamp(),
                'vector_store_type': 'faiss'
            }

            return config

        except Exception as e:
            return {'error': f"Pipeline export failed: {str(e)}"}

    def _get_timestamp(self) -> str:
        """Get current timestamp as 'YYYY-MM-DD HH:MM:SS'."""
        from datetime import datetime
        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')
1177
+
1178
+
1179
+ # Utility functions for the module
1180
+
1181
def optimize_rag_pipeline(documents: List[Document], embeddings_model, llm,
                          test_queries: List[str]) -> Dict[str, Any]:
    """
    Optimize RAG pipeline configuration for given documents and queries

    Args:
        documents (List[Document]): Documents to optimize for
        embeddings_model: Embedding model to use
        llm: Language model to use
        test_queries (List[str]): Test queries for optimization

    Returns:
        Dict: Optimization recommendations
    """
    try:
        # Stage 1: compare chunking strategies, then chunk sizes, independently.
        strategy_report = VectorChunker(embeddings_model).optimize_chunking_strategy(documents, test_queries)
        size_report = ChunkingOptimizer(embeddings_model).optimize_chunk_size(documents, test_queries)

        chosen_strategy = strategy_report.get('recommended_strategy', 'semantic')
        chosen_size = size_report.get('optimal_chunk_size', 1000)
        overlap = chosen_size // 5  # 20% overlap

        # Stage 2: build a throwaway pipeline with the winning configuration.
        candidate = RAGPipeline(embeddings_model, llm)
        candidate.chunker = VectorChunker(
            embeddings_model,
            chunk_size=chosen_size,
            chunk_overlap=overlap
        )

        trial_id = "optimization_test"
        build_outcome = candidate.create_pipeline(documents, trial_id, chosen_strategy)

        if build_outcome.get('error'):
            trial_evaluation = {'error': 'Could not evaluate optimized pipeline'}
        else:
            trial_evaluation = candidate.evaluate_pipeline(trial_id, test_queries)
            candidate.delete_pipeline(trial_id)  # Clean up

        return {
            'optimization_complete': True,
            'recommended_config': {
                'chunking_strategy': chosen_strategy,
                'chunk_size': chosen_size,
                'chunk_overlap': overlap
            },
            'chunking_optimization': strategy_report,
            'size_optimization': size_report,
            'performance_evaluation': trial_evaluation,
            'recommendations': [
                f"Use {chosen_strategy} chunking strategy",
                f"Set chunk size to {chosen_size} characters",
                f"Use {overlap} character overlap",
                "Monitor and adjust based on query performance"
            ]
        }

    except Exception as e:
        return {'error': f"RAG optimization failed: {str(e)}"}
1248
+
1249
+
1250
def create_demo_rag_system(sample_documents: List[Document], embeddings_model, llm) -> Dict[str, Any]:
    """
    Create a demonstration RAG system with sample documents

    Args:
        sample_documents (List[Document]): Sample documents for demo
        embeddings_model: Embedding model
        llm: Language model

    Returns:
        Dict: Demo system information and sample interactions
    """
    try:
        demo_id = "demo_system"
        demo = RAGPipeline(embeddings_model, llm)

        # Build the demo pipeline with the default semantic strategy.
        setup = demo.create_pipeline(sample_documents, demo_id, "semantic")
        if setup.get('error'):
            return {'error': f"Demo system creation failed: {setup['error']}"}

        # Canned questions showcasing typical usage.
        demo_queries = [
            "What is the main topic of these documents?",
            "Can you summarize the key points?",
            "What are the most important concepts mentioned?"
        ]
        demo_results = [
            demo.query_pipeline(demo_id, question, return_sources=True)
            for question in demo_queries
        ]

        return {
            'demo_system_created': True,
            'pipeline_id': demo_id,
            'creation_stats': setup,
            'pipeline_info': demo.get_pipeline_info(demo_id),
            'demo_queries': demo_queries,
            'demo_results': demo_results,
            'usage_instructions': [
                f"Use pipeline.query_pipeline('{demo_id}', 'your question') to ask questions",
                "The system will return answers with source document references",
                "Sources show which parts of the documents were used for the answer"
            ]
        }

    except Exception as e:
        return {'error': f"Demo system creation failed: {str(e)}"}
1305
+
1306
+
1307
+ # Export the main classes for use in other modules
1308
# Names exported by `from <module> import *` — the module's public API.
__all__ = [
    'VectorChunker',
    'ChunkingOptimizer',
    'RAGPipeline',
    'optimize_rag_pipeline',
    'create_demo_rag_system'
]
utils/export.py ADDED
@@ -0,0 +1,1896 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Results Export and Reporting Module
3
+ Handles export of analysis results, reports, and data for external use
4
+ """
5
+
6
+ import json
7
+ import csv
8
+ import io
9
+ import zipfile
10
+ import tempfile
11
+ import os
12
+ from datetime import datetime
13
+ from typing import Dict, Any, List, Optional, Union
14
+ import pandas as pd
15
+ from dataclasses import dataclass, asdict
16
+
17
+
18
@dataclass
class GEOReport:
    """Data class for GEO analysis reports.

    Plain record consumed by the exporters in this module; ``to_dict``
    yields a JSON-serializable mapping via ``dataclasses.asdict``.
    """
    website_url: str                                  # URL of the analyzed site
    analysis_date: str                                # date of the analysis (presumably ISO-8601 — confirm with producer)
    overall_score: float                              # aggregate GEO score across pages
    pages_analyzed: int                               # number of pages included in the analysis
    geo_scores: Dict[str, float]                      # per-metric scores
    recommendations: List[str]                        # human-readable improvement tips
    optimization_opportunities: List[Dict[str, Any]]  # structured opportunity records
    competitive_position: str                         # qualitative competitive summary

    def to_dict(self) -> Dict[str, Any]:
        """Convert report to dictionary (recursive deep copy via asdict)."""
        return asdict(self)
33
+
34
+
35
@dataclass
class ContentAnalysis:
    """Data class for content optimization analysis.

    Plain record of a single content-enhancement run; ``to_dict`` yields a
    JSON-serializable mapping via ``dataclasses.asdict``.
    """
    original_content: str            # the content as submitted
    analysis_date: str               # date of the analysis (presumably ISO-8601 — confirm with producer)
    clarity_score: float             # clarity metric
    structure_score: float           # structure metric
    answerability_score: float       # answerability metric
    keywords: List[str]              # keywords identified in the content
    optimized_content: Optional[str] # rewritten content, or None if analyze-only
    improvements_made: List[str]     # descriptions of applied improvements

    def to_dict(self) -> Dict[str, Any]:
        """Convert analysis to dictionary (recursive deep copy via asdict)."""
        return asdict(self)
51
+
52
+ class ResultExporter:
53
+ """Main class for exporting analysis results and generating reports"""
54
+
55
+ def __init__(self):
56
+ self.export_formats = ['json', 'csv', 'html', 'pdf', 'xlsx']
57
+ self.supported_types = ['geo_analysis', 'content_optimization', 'qa_results', 'batch_analysis']
58
+
59
+ def export_geo_results(self, geo_results: List[Dict[str, Any]],
60
+ website_url: str, format_type: str = 'json') -> Union[str, bytes, Dict[str, Any]]:
61
+ """
62
+ Export GEO analysis results in specified format
63
+
64
+ Args:
65
+ geo_results (List[Dict]): List of GEO analysis results
66
+ website_url (str): URL of analyzed website
67
+ format_type (str): Export format ('json', 'csv', 'html', 'xlsx')
68
+
69
+ Returns:
70
+ Union[str, bytes, Dict]: Exported data in requested format
71
+ """
72
+ try:
73
+ # Prepare consolidated data
74
+ export_data = self._prepare_geo_export_data(geo_results, website_url)
75
+
76
+ if format_type.lower() == 'json':
77
+ return self._export_geo_json(export_data)
78
+ elif format_type.lower() == 'csv':
79
+ return self._export_geo_csv(export_data)
80
+ elif format_type.lower() == 'html':
81
+ return self._export_geo_html(export_data)
82
+ elif format_type.lower() == 'xlsx':
83
+ return self._export_geo_excel(export_data)
84
+ elif format_type.lower() == 'pdf':
85
+ return self._export_geo_pdf(export_data)
86
+ else:
87
+ raise ValueError(f"Unsupported export format: {format_type}")
88
+
89
+ except Exception as e:
90
+ return {'error': f"Export failed: {str(e)}"}
91
+
92
+ def export_enhancement_results(self, enhancement_result: Dict[str, Any],
93
+ format_type: str = 'json') -> Union[str, bytes, Dict[str, Any]]:
94
+ """
95
+ Export content enhancement results
96
+
97
+ Args:
98
+ enhancement_result (Dict): Content enhancement analysis result
99
+ format_type (str): Export format
100
+
101
+ Returns:
102
+ Union[str, bytes, Dict]: Exported data
103
+ """
104
+ try:
105
+ # Prepare data for export
106
+ export_data = self._prepare_enhancement_export_data(enhancement_result)
107
+
108
+ if format_type.lower() == 'json':
109
+ return json.dumps(export_data, indent=2, ensure_ascii=False)
110
+ elif format_type.lower() == 'html':
111
+ return self._export_enhancement_html(export_data)
112
+ elif format_type.lower() == 'csv':
113
+ return self._export_enhancement_csv(export_data)
114
+ else:
115
+ return json.dumps(export_data, indent=2, ensure_ascii=False)
116
+
117
+ except Exception as e:
118
+ return {'error': f"Enhancement export failed: {str(e)}"}
119
+
120
+ def export_qa_results(self, qa_results: List[Dict[str, Any]],
121
+ format_type: str = 'json') -> Union[str, bytes, Dict[str, Any]]:
122
+ """
123
+ Export Q&A session results
124
+
125
+ Args:
126
+ qa_results (List[Dict]): List of Q&A interactions
127
+ format_type (str): Export format
128
+
129
+ Returns:
130
+ Union[str, bytes, Dict]: Exported data
131
+ """
132
+ try:
133
+ export_data = {
134
+ 'qa_session': {
135
+ 'session_date': datetime.now().isoformat(),
136
+ 'total_questions': len(qa_results),
137
+ 'interactions': qa_results
138
+ },
139
+ 'summary': {
140
+ 'successful_answers': len([r for r in qa_results if not r.get('error')]),
141
+ 'average_response_length': self._calculate_avg_response_length(qa_results),
142
+ 'most_common_topics': self._extract_common_topics(qa_results)
143
+ }
144
+ }
145
+
146
+ if format_type.lower() == 'json':
147
+ return json.dumps(export_data, indent=2, ensure_ascii=False)
148
+ elif format_type.lower() == 'html':
149
+ return self._export_qa_html(export_data)
150
+ elif format_type.lower() == 'csv':
151
+ return self._export_qa_csv(export_data)
152
+ else:
153
+ return json.dumps(export_data, indent=2, ensure_ascii=False)
154
+
155
+ except Exception as e:
156
+ return {'error': f"Q&A export failed: {str(e)}"}
157
+
158
+ def create_comprehensive_report(self, analysis_data: Dict[str, Any],
159
+ report_type: str = 'full') -> Dict[str, Any]:
160
+ """
161
+ Create comprehensive analysis report
162
+
163
+ Args:
164
+ analysis_data (Dict): Combined analysis data from multiple sources
165
+ report_type (str): Type of report ('full', 'summary', 'executive')
166
+
167
+ Returns:
168
+ Dict: Comprehensive report data
169
+ """
170
+ try:
171
+ report = {
172
+ 'report_metadata': {
173
+ 'generated_at': datetime.now().isoformat(),
174
+ 'report_type': report_type,
175
+ 'generator': 'GEO SEO AI Optimizer',
176
+ 'version': '1.0'
177
+ }
178
+ }
179
+
180
+ if report_type == 'executive':
181
+ report.update(self._create_executive_summary(analysis_data))
182
+ elif report_type == 'summary':
183
+ report.update(self._create_summary_report(analysis_data))
184
+ else: # full report
185
+ report.update(self._create_full_report(analysis_data))
186
+
187
+ return report
188
+
189
+ except Exception as e:
190
+ return {'error': f"Report creation failed: {str(e)}"}
191
+
192
+ def export_batch_results(self, batch_results: List[Dict[str, Any]],
193
+ batch_metadata: Dict[str, Any],
194
+ format_type: str = 'xlsx') -> Union[str, bytes, Dict[str, Any]]:
195
+ """
196
+ Export batch analysis results
197
+
198
+ Args:
199
+ batch_results (List[Dict]): List of batch analysis results
200
+ batch_metadata (Dict): Metadata about the batch process
201
+ format_type (str): Export format
202
+
203
+ Returns:
204
+ Union[str, bytes, Dict]: Exported batch data
205
+ """
206
+ try:
207
+ export_data = {
208
+ 'batch_metadata': batch_metadata,
209
+ 'batch_results': batch_results,
210
+ 'batch_summary': self._create_batch_summary(batch_results),
211
+ 'export_timestamp': datetime.now().isoformat()
212
+ }
213
+
214
+ if format_type.lower() == 'xlsx':
215
+ return self._export_batch_excel(export_data)
216
+ elif format_type.lower() == 'json':
217
+ return json.dumps(export_data, indent=2, ensure_ascii=False)
218
+ elif format_type.lower() == 'csv':
219
+ return self._export_batch_csv(export_data)
220
+ else:
221
+ return json.dumps(export_data, indent=2, ensure_ascii=False)
222
+
223
+ except Exception as e:
224
+ return {'error': f"Batch export failed: {str(e)}"}
225
+
226
+ def create_export_package(self, analysis_data: Dict[str, Any],
227
+ package_name: str = "geo_analysis") -> bytes:
228
+ """
229
+ Create a ZIP package with multiple export formats
230
+
231
+ Args:
232
+ analysis_data (Dict): Analysis data to package
233
+ package_name (str): Name for the package
234
+
235
+ Returns:
236
+ bytes: ZIP file content
237
+ """
238
+ try:
239
+ # Create temporary directory
240
+ with tempfile.TemporaryDirectory() as temp_dir:
241
+ zip_path = os.path.join(temp_dir, f"{package_name}.zip")
242
+
243
+ with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zip_file:
244
+ # Add JSON export
245
+ json_data = json.dumps(analysis_data, indent=2, ensure_ascii=False)
246
+ zip_file.writestr(f"{package_name}.json", json_data)
247
+
248
+ # Add HTML report
249
+ if 'geo_results' in analysis_data:
250
+ html_data = self._export_geo_html(analysis_data)
251
+ zip_file.writestr(f"{package_name}_report.html", html_data)
252
+
253
+ # Add CSV data
254
+ if 'geo_results' in analysis_data:
255
+ csv_data = self._export_geo_csv(analysis_data)
256
+ zip_file.writestr(f"{package_name}_data.csv", csv_data)
257
+
258
+ # Add README
259
+ readme_content = self._generate_package_readme(analysis_data)
260
+ zip_file.writestr("README.txt", readme_content)
261
+
262
+ # Read the ZIP file
263
+ with open(zip_path, 'rb') as zip_file:
264
+ return zip_file.read()
265
+
266
+ except Exception as e:
267
+ raise Exception(f"Package creation failed: {str(e)}")
268
+
269
+ def _prepare_geo_export_data(self, geo_results: List[Dict[str, Any]], website_url: str) -> Dict[str, Any]:
270
+ """Prepare GEO data for export"""
271
+ try:
272
+ # Calculate aggregate metrics
273
+ valid_results = [r for r in geo_results if 'geo_scores' in r and not r.get('error')]
274
+
275
+ if not valid_results:
276
+ return {
277
+ 'error': 'No valid GEO results to export',
278
+ 'website_url': website_url,
279
+ 'export_timestamp': datetime.now().isoformat()
280
+ }
281
+
282
+ # Aggregate scores
283
+ all_scores = {}
284
+ for result in valid_results:
285
+ for metric, score in result.get('geo_scores', {}).items():
286
+ if metric not in all_scores:
287
+ all_scores[metric] = []
288
+ all_scores[metric].append(score)
289
+
290
+ avg_scores = {metric: sum(scores) / len(scores) for metric, scores in all_scores.items()}
291
+ overall_avg = sum(avg_scores.values()) / len(avg_scores) if avg_scores else 0
292
+
293
+ # Collect recommendations
294
+ all_recommendations = []
295
+ all_opportunities = []
296
+
297
+ for result in valid_results:
298
+ all_recommendations.extend(result.get('recommendations', []))
299
+ all_opportunities.extend(result.get('optimization_opportunities', []))
300
+
301
+ # Remove duplicates
302
+ unique_recommendations = list(set(all_recommendations))
303
+
304
+ return {
305
+ 'website_analysis': {
306
+ 'url': website_url,
307
+ 'analysis_date': datetime.now().isoformat(),
308
+ 'pages_analyzed': len(valid_results),
309
+ 'overall_geo_score': round(overall_avg, 2)
310
+ },
311
+ 'aggregate_scores': avg_scores,
312
+ 'individual_page_results': valid_results,
313
+ 'recommendations': unique_recommendations[:10], # Top 10
314
+ 'optimization_opportunities': all_opportunities,
315
+ 'performance_insights': self._generate_performance_insights(avg_scores, overall_avg),
316
+ 'export_metadata': {
317
+ 'exported_by': 'GEO SEO AI Optimizer',
318
+ 'export_timestamp': datetime.now().isoformat(),
319
+ 'data_format': 'GEO Analysis Results v1.0'
320
+ }
321
+ }
322
+
323
+ except Exception as e:
324
+ return {'error': f"Data preparation failed: {str(e)}"}
325
+
326
+ def _prepare_enhancement_export_data(self, enhancement_result: Dict[str, Any]) -> Dict[str, Any]:
327
+ """Prepare content enhancement data for export"""
328
+ try:
329
+ scores = enhancement_result.get('scores', {})
330
+
331
+ return {
332
+ 'content_analysis': {
333
+ 'analysis_date': datetime.now().isoformat(),
334
+ 'original_content_length': enhancement_result.get('original_length', 0),
335
+ 'original_word_count': enhancement_result.get('original_word_count', 0),
336
+ 'analysis_type': enhancement_result.get('optimization_type', 'standard')
337
+ },
338
+ 'performance_scores': {
339
+ 'clarity': scores.get('clarity', 0),
340
+ 'structure': scores.get('structuredness', 0),
341
+ 'answerability': scores.get('answerability', 0),
342
+ 'overall_average': sum(scores.values()) / len(scores) if scores else 0
343
+ },
344
+ 'optimization_results': {
345
+ 'keywords_identified': enhancement_result.get('keywords', []),
346
+ 'optimized_content': enhancement_result.get('optimized_text', ''),
347
+ 'improvements_made': enhancement_result.get('optimization_suggestions', []),
348
+ 'analyze_only': enhancement_result.get('analyze_only', False)
349
+ },
350
+ 'export_metadata': {
351
+ 'exported_by': 'GEO SEO AI Optimizer',
352
+ 'export_timestamp': datetime.now().isoformat(),
353
+ 'data_format': 'Content Enhancement Results v1.0'
354
+ }
355
+ }
356
+
357
+ except Exception as e:
358
+ return {'error': f"Enhancement data preparation failed: {str(e)}"}
359
+
360
+ def _export_geo_json(self, data: Dict[str, Any]) -> str:
361
+ """Export GEO data as JSON"""
362
+ return json.dumps(data, indent=2, ensure_ascii=False)
363
+
364
+ def _export_geo_csv(self, data: Dict[str, Any]) -> str:
365
+ """Export GEO data as CSV"""
366
+ try:
367
+ output = io.StringIO()
368
+
369
+ # Write aggregate scores
370
+ writer = csv.writer(output)
371
+ writer.writerow(['GEO Analysis Results'])
372
+ writer.writerow(['Website:', data.get('website_analysis', {}).get('url', 'Unknown')])
373
+ writer.writerow(['Analysis Date:', data.get('website_analysis', {}).get('analysis_date', 'Unknown')])
374
+ writer.writerow(['Overall Score:', data.get('website_analysis', {}).get('overall_geo_score', 0)])
375
+ writer.writerow([])
376
+
377
+ # Write aggregate scores
378
+ writer.writerow(['Metric', 'Score'])
379
+ for metric, score in data.get('aggregate_scores', {}).items():
380
+ writer.writerow([metric.replace('_', ' ').title(), round(score, 2)])
381
+
382
+ writer.writerow([])
383
+ writer.writerow(['Recommendations'])
384
+ for i, rec in enumerate(data.get('recommendations', []), 1):
385
+ writer.writerow([f"{i}.", rec])
386
+
387
+ # Individual page results
388
+ if data.get('individual_page_results'):
389
+ writer.writerow([])
390
+ writer.writerow(['Individual Page Results'])
391
+
392
+ # Header for page results
393
+ first_result = data['individual_page_results'][0]
394
+ if 'geo_scores' in first_result:
395
+ headers = ['Page Index', 'Page URL', 'Page Title'] + list(first_result['geo_scores'].keys())
396
+ writer.writerow(headers)
397
+
398
+ for i, result in enumerate(data['individual_page_results']):
399
+ page_data = result.get('page_data', {})
400
+ scores = result.get('geo_scores', {})
401
+
402
+ row = [
403
+ i + 1,
404
+ page_data.get('url', 'Unknown'),
405
+ page_data.get('title', 'Unknown')
406
+ ] + [round(scores.get(metric, 0), 2) for metric in headers[3:]]
407
+
408
+ writer.writerow(row)
409
+
410
+ return output.getvalue()
411
+
412
+ except Exception as e:
413
+ return f"CSV export error: {str(e)}"
414
+
415
    def _export_geo_html(self, data: Dict[str, Any]) -> str:
        """Export GEO data as a standalone, styled HTML report.

        Reads the 'website_analysis', 'aggregate_scores' and
        'recommendations' sections of the prepared payload (all optional)
        and returns a complete HTML document. On any failure a minimal
        error page is returned instead of raising.
        """
        try:
            website_info = data.get('website_analysis', {})
            scores = data.get('aggregate_scores', {})
            recommendations = data.get('recommendations', [])

            # NOTE: inside this f-string all literal CSS braces are doubled
            # ({{ }}) so that only the {...} placeholders interpolate.
            html_content = f"""
            <!DOCTYPE html>
            <html lang="en">
            <head>
                <meta charset="UTF-8">
                <meta name="viewport" content="width=device-width, initial-scale=1.0">
                <title>GEO Analysis Report - {website_info.get('url', 'Website')}</title>
                <style>
                    body {{
                        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
                        line-height: 1.6;
                        color: #333;
                        max-width: 1200px;
                        margin: 0 auto;
                        padding: 20px;
                        background-color: #f5f5f5;
                    }}
                    .header {{
                        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                        color: white;
                        padding: 30px;
                        border-radius: 10px;
                        margin-bottom: 30px;
                        text-align: center;
                    }}
                    .header h1 {{
                        margin: 0;
                        font-size: 2.5em;
                    }}
                    .summary-cards {{
                        display: grid;
                        grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
                        gap: 20px;
                        margin-bottom: 30px;
                    }}
                    .card {{
                        background: white;
                        padding: 20px;
                        border-radius: 10px;
                        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
                        text-align: center;
                    }}
                    .card h3 {{
                        margin-top: 0;
                        color: #667eea;
                    }}
                    .score {{
                        font-size: 2em;
                        font-weight: bold;
                        color: #333;
                    }}
                    .scores-grid {{
                        display: grid;
                        grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
                        gap: 20px;
                        margin-bottom: 30px;
                    }}
                    .score-item {{
                        background: white;
                        padding: 15px;
                        border-radius: 8px;
                        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
                        display: flex;
                        justify-content: space-between;
                        align-items: center;
                    }}
                    .score-bar {{
                        width: 100px;
                        height: 10px;
                        background: #e0e0e0;
                        border-radius: 5px;
                        overflow: hidden;
                    }}
                    .score-fill {{
                        height: 100%;
                        background: linear-gradient(90deg, #ff6b6b, #ffa500, #4ecdc4);
                        transition: width 0.3s ease;
                    }}
                    .recommendations {{
                        background: white;
                        padding: 30px;
                        border-radius: 10px;
                        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
                        margin-bottom: 30px;
                    }}
                    .recommendations h2 {{
                        color: #667eea;
                        border-bottom: 2px solid #667eea;
                        padding-bottom: 10px;
                    }}
                    .rec-item {{
                        padding: 10px 0;
                        border-bottom: 1px solid #eee;
                    }}
                    .footer {{
                        text-align: center;
                        color: #666;
                        margin-top: 40px;
                        padding-top: 20px;
                        border-top: 1px solid #ddd;
                    }}
                </style>
            </head>
            <body>
                <div class="header">
                    <h1>🚀 GEO Analysis Report</h1>
                    <p>Generative Engine Optimization Performance Analysis</p>
                    <p><strong>Website:</strong> {website_info.get('url', 'Not specified')}</p>
                    <p><strong>Analysis Date:</strong> {website_info.get('analysis_date', 'Not specified')}</p>
                </div>

                <div class="summary-cards">
                    <div class="card">
                        <h3>Overall GEO Score</h3>
                        <div class="score">{website_info.get('overall_geo_score', 0)}/10</div>
                    </div>
                    <div class="card">
                        <h3>Pages Analyzed</h3>
                        <div class="score">{website_info.get('pages_analyzed', 0)}</div>
                    </div>
                    <div class="card">
                        <h3>Recommendations</h3>
                        <div class="score">{len(recommendations)}</div>
                    </div>
                </div>

                <h2>📊 Detailed GEO Metrics</h2>
                <div class="scores-grid">
            """

            # Add individual scores: one flex row with a proportional bar
            # per aggregate metric.
            for metric, score in scores.items():
                metric_display = metric.replace('_', ' ').title()
                # Scores appear to be on a 0-10 scale (rendered as "x/10")
                # — TODO confirm; the cap at 100 keeps the bar inside its
                # track even for out-of-range values.
                score_percentage = min(score * 10, 100)  # Convert to percentage

                html_content += f"""
                <div class="score-item">
                    <div>
                        <strong>{metric_display}</strong><br>
                        <span style="color: #666;">{score:.1f}/10</span>
                    </div>
                    <div class="score-bar">
                        <div class="score-fill" style="width: {score_percentage}%;"></div>
                    </div>
                </div>
                """

            html_content += """
            </div>

            <div class="recommendations">
                <h2>💡 Optimization Recommendations</h2>
            """

            # Add recommendations as a numbered list of styled rows.
            for i, rec in enumerate(recommendations, 1):
                html_content += f'<div class="rec-item"><strong>{i}.</strong> {rec}</div>'

            html_content += f"""
            </div>

            <div class="footer">
                <p>Generated by GEO SEO AI Optimizer | {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
                <p>This report provides AI-first SEO optimization insights for better generative engine performance.</p>
            </div>
            </body>
            </html>
            """

            return html_content

        except Exception as e:
            # Degrade to a minimal error page rather than raising.
            return f"<html><body><h1>HTML Export Error</h1><p>{str(e)}</p></body></html>"
    def _export_geo_excel(self, data: Dict[str, Any]) -> bytes:
        """Export GEO data as an Excel workbook, returned as raw bytes.

        Writes up to four sheets via pandas/openpyxl: 'Summary',
        'GEO Scores', 'Recommendations' (only when non-empty) and
        'Individual Pages' (only when page results exist). If workbook
        creation fails for any reason, the error text plus a JSON dump of
        the payload is returned as UTF-8 bytes instead.
        """
        try:
            output = io.BytesIO()

            with pd.ExcelWriter(output, engine='openpyxl') as writer:
                # Summary sheet: fixed key/value rows for the website-level info.
                summary_data = {
                    'Metric': ['Website URL', 'Analysis Date', 'Pages Analyzed', 'Overall Score'],
                    'Value': [
                        data.get('website_analysis', {}).get('url', 'Unknown'),
                        data.get('website_analysis', {}).get('analysis_date', 'Unknown'),
                        data.get('website_analysis', {}).get('pages_analyzed', 0),
                        data.get('website_analysis', {}).get('overall_geo_score', 0)
                    ]
                }
                pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary', index=False)

                # Scores sheet: aggregate metric scores plus a qualitative label.
                scores_data = []
                for metric, score in data.get('aggregate_scores', {}).items():
                    scores_data.append({
                        'Metric': metric.replace('_', ' ').title(),
                        'Score': round(score, 2),
                        'Performance': self._get_performance_level(score)
                    })

                pd.DataFrame(scores_data).to_excel(writer, sheet_name='GEO Scores', index=False)

                # Recommendations sheet: priority = original list order.
                rec_data = []
                for i, rec in enumerate(data.get('recommendations', []), 1):
                    rec_data.append({
                        'Priority': i,
                        'Recommendation': rec,
                        'Category': self._categorize_recommendation(rec)
                    })

                if rec_data:
                    pd.DataFrame(rec_data).to_excel(writer, sheet_name='Recommendations', index=False)

                # Individual pages sheet: one row per page, with each GEO
                # metric expanded into its own column.
                if data.get('individual_page_results'):
                    pages_data = []
                    for i, result in enumerate(data['individual_page_results']):
                        page_data = result.get('page_data', {})
                        scores = result.get('geo_scores', {})

                        page_row = {
                            'Page_Index': i + 1,
                            'URL': page_data.get('url', 'Unknown'),
                            'Title': page_data.get('title', 'Unknown'),
                            'Word_Count': page_data.get('word_count', 0)
                        }

                        # Add all GEO scores
                        for metric, score in scores.items():
                            page_row[metric.replace('_', ' ').title()] = round(score, 2)

                        pages_data.append(page_row)

                    pd.DataFrame(pages_data).to_excel(writer, sheet_name='Individual Pages', index=False)

            output.seek(0)
            return output.getvalue()

        except Exception as e:
            # Return error as text file if Excel creation fails
            error_content = f"Excel export failed: {str(e)}\n\nData:\n{json.dumps(data, indent=2)}"
            return error_content.encode('utf-8')
    def _export_enhancement_html(self, data: Dict[str, Any]) -> str:
        """Export content enhancement results as a standalone HTML report.

        Reads the 'content_analysis', 'performance_scores' and
        'optimization_results' sections of the prepared payload. The
        optimized-content section is only rendered when optimized text
        exists and the run was not analyze-only. Returns a minimal error
        page on failure.
        """
        try:
            analysis = data.get('content_analysis', {})
            scores = data.get('performance_scores', {})
            optimization = data.get('optimization_results', {})

            # NOTE: literal CSS braces are doubled ({{ }}) inside this
            # f-string so only the {...} placeholders interpolate.
            html_content = f"""
            <!DOCTYPE html>
            <html lang="en">
            <head>
                <meta charset="UTF-8">
                <meta name="viewport" content="width=device-width, initial-scale=1.0">
                <title>Content Enhancement Report</title>
                <style>
                    body {{
                        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
                        line-height: 1.6;
                        color: #333;
                        max-width: 1000px;
                        margin: 0 auto;
                        padding: 20px;
                        background-color: #f8f9fa;
                    }}
                    .header {{
                        background: linear-gradient(135deg, #28a745 0%, #20c997 100%);
                        color: white;
                        padding: 30px;
                        border-radius: 10px;
                        margin-bottom: 30px;
                        text-align: center;
                    }}
                    .scores {{
                        display: grid;
                        grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
                        gap: 20px;
                        margin-bottom: 30px;
                    }}
                    .score-card {{
                        background: white;
                        padding: 20px;
                        border-radius: 10px;
                        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
                        text-align: center;
                    }}
                    .content-section {{
                        background: white;
                        padding: 30px;
                        border-radius: 10px;
                        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
                        margin-bottom: 20px;
                    }}
                    .keywords {{
                        display: flex;
                        flex-wrap: wrap;
                        gap: 10px;
                        margin-top: 15px;
                    }}
                    .keyword {{
                        background: #e9ecef;
                        padding: 5px 10px;
                        border-radius: 20px;
                        font-size: 0.9em;
                    }}
                    .optimized-content {{
                        background: #f8f9fa;
                        padding: 20px;
                        border-left: 4px solid #28a745;
                        border-radius: 5px;
                        font-style: italic;
                    }}
                </style>
            </head>
            <body>
                <div class="header">
                    <h1>🔧 Content Enhancement Report</h1>
                    <p>AI-Optimized Content Analysis Results</p>
                    <p><strong>Analysis Date:</strong> {analysis.get('analysis_date', 'Unknown')}</p>
                </div>

                <div class="scores">
                    <div class="score-card">
                        <h3>Clarity Score</h3>
                        <div style="font-size: 2em; font-weight: bold; color: #28a745;">
                            {scores.get('clarity', 0):.1f}/10
                        </div>
                    </div>
                    <div class="score-card">
                        <h3>Structure Score</h3>
                        <div style="font-size: 2em; font-weight: bold; color: #28a745;">
                            {scores.get('structure', 0):.1f}/10
                        </div>
                    </div>
                    <div class="score-card">
                        <h3>Answerability Score</h3>
                        <div style="font-size: 2em; font-weight: bold; color: #28a745;">
                            {scores.get('answerability', 0):.1f}/10
                        </div>
                    </div>
                    <div class="score-card">
                        <h3>Overall Average</h3>
                        <div style="font-size: 2em; font-weight: bold; color: #28a745;">
                            {scores.get('overall_average', 0):.1f}/10
                        </div>
                    </div>
                </div>

                <div class="content-section">
                    <h2>🔑 Identified Keywords</h2>
                    <div class="keywords">
                        {' '.join([f'<span class="keyword">{keyword}</span>' for keyword in optimization.get('keywords_identified', [])])}
                    </div>
                </div>

                {'<div class="content-section"><h2>✨ Optimized Content</h2><div class="optimized-content">' + optimization.get('optimized_content', '') + '</div></div>' if optimization.get('optimized_content') and not optimization.get('analyze_only') else ''}

                <div class="content-section">
                    <h2>💡 Improvements Made</h2>
                    <ul>
                        {' '.join([f'<li>{improvement}</li>' for improvement in optimization.get('improvements_made', [])])}
                    </ul>
                </div>

                <div style="text-align: center; color: #666; margin-top: 40px; padding-top: 20px; border-top: 1px solid #ddd;">
                    <p>Generated by GEO SEO AI Optimizer | {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
                </div>
            </body>
            </html>
            """

            return html_content

        except Exception as e:
            # Degrade to a minimal error page rather than raising.
            return f"<html><body><h1>Enhancement HTML Export Error</h1><p>{str(e)}</p></body></html>"
+ def _export_enhancement_csv(self, data: Dict[str, Any]) -> str:
803
+ """Export content enhancement results as CSV"""
804
+ try:
805
+ output = io.StringIO()
806
+ writer = csv.writer(output)
807
+
808
+ # Header information
809
+ analysis = data.get('content_analysis', {})
810
+ scores = data.get('performance_scores', {})
811
+ optimization = data.get('optimization_results', {})
812
+
813
+ writer.writerow(['Content Enhancement Analysis Report'])
814
+ writer.writerow(['Analysis Date:', analysis.get('analysis_date', 'Unknown')])
815
+ writer.writerow(['Original Content Length:', analysis.get('original_content_length', 0)])
816
+ writer.writerow(['Original Word Count:', analysis.get('original_word_count', 0)])
817
+ writer.writerow([])
818
+
819
+ # Performance scores
820
+ writer.writerow(['Performance Scores'])
821
+ writer.writerow(['Metric', 'Score'])
822
+ for metric, score in scores.items():
823
+ writer.writerow([metric.replace('_', ' ').title(), round(score, 2)])
824
+
825
+ writer.writerow([])
826
+ writer.writerow(['Keywords Identified'])
827
+ for keyword in optimization.get('keywords_identified', []):
828
+ writer.writerow([keyword])
829
+
830
+ writer.writerow([])
831
+ writer.writerow(['Improvements Made'])
832
+ for improvement in optimization.get('improvements_made', []):
833
+ writer.writerow([improvement])
834
+
835
+ return output.getvalue()
836
+
837
+ except Exception as e:
838
+ return f"Enhancement CSV export error: {str(e)}"
839
+
840
    def _export_qa_html(self, data: Dict[str, Any]) -> str:
        """Export Q&A session results as a standalone HTML report.

        Reads 'qa_session' (with its 'interactions' list) and 'summary'
        from the prepared payload. Each interaction renders as a
        question/answer pair with up to three source previews. Returns a
        minimal error page on failure.
        """
        try:
            session = data.get('qa_session', {})
            summary = data.get('summary', {})
            interactions = session.get('interactions', [])

            # NOTE: literal CSS braces are doubled ({{ }}) inside this
            # f-string so only the {...} placeholders interpolate.
            html_content = f"""
            <!DOCTYPE html>
            <html lang="en">
            <head>
                <meta charset="UTF-8">
                <meta name="viewport" content="width=device-width, initial-scale=1.0">
                <title>Q&A Session Report</title>
                <style>
                    body {{
                        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
                        line-height: 1.6;
                        color: #333;
                        max-width: 1000px;
                        margin: 0 auto;
                        padding: 20px;
                        background-color: #f8f9fa;
                    }}
                    .header {{
                        background: linear-gradient(135deg, #6f42c1 0%, #e83e8c 100%);
                        color: white;
                        padding: 30px;
                        border-radius: 10px;
                        margin-bottom: 30px;
                        text-align: center;
                    }}
                    .summary {{
                        display: grid;
                        grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
                        gap: 20px;
                        margin-bottom: 30px;
                    }}
                    .summary-card {{
                        background: white;
                        padding: 20px;
                        border-radius: 10px;
                        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
                        text-align: center;
                    }}
                    .qa-item {{
                        background: white;
                        padding: 20px;
                        border-radius: 10px;
                        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
                        margin-bottom: 20px;
                    }}
                    .question {{
                        background: #e9ecef;
                        padding: 15px;
                        border-left: 4px solid #6f42c1;
                        border-radius: 5px;
                        margin-bottom: 15px;
                    }}
                    .answer {{
                        padding: 15px;
                        border-left: 4px solid #28a745;
                        border-radius: 5px;
                        background: #f8f9fa;
                    }}
                    .sources {{
                        margin-top: 15px;
                        padding: 10px;
                        background: #fff3cd;
                        border-radius: 5px;
                        font-size: 0.9em;
                    }}
                </style>
            </head>
            <body>
                <div class="header">
                    <h1>💬 Q&A Session Report</h1>
                    <p>Document Question & Answer Analysis</p>
                    <p><strong>Session Date:</strong> {session.get('session_date', 'Unknown')}</p>
                </div>

                <div class="summary">
                    <div class="summary-card">
                        <h3>Total Questions</h3>
                        <div style="font-size: 2em; font-weight: bold; color: #6f42c1;">
                            {session.get('total_questions', 0)}
                        </div>
                    </div>
                    <div class="summary-card">
                        <h3>Successful Answers</h3>
                        <div style="font-size: 2em; font-weight: bold; color: #28a745;">
                            {summary.get('successful_answers', 0)}
                        </div>
                    </div>
                    <div class="summary-card">
                        <h3>Avg Response Length</h3>
                        <div style="font-size: 2em; font-weight: bold; color: #17a2b8;">
                            {summary.get('average_response_length', 0):.0f}
                        </div>
                    </div>
                </div>

                <h2>📝 Q&A Interactions</h2>
            """

            # Add individual Q&A items. The answer is read from 'result'
            # first, falling back to 'answer' for older payload shapes.
            for i, interaction in enumerate(interactions, 1):
                question = interaction.get('query', 'No question')
                answer = interaction.get('result', interaction.get('answer', 'No answer'))
                sources = interaction.get('sources', [])

                html_content += f"""
                <div class="qa-item">
                    <h3>Question {i}</h3>
                    <div class="question">
                        <strong>Q:</strong> {question}
                    </div>
                    <div class="answer">
                        <strong>A:</strong> {answer}
                    </div>
                """

                if sources:
                    html_content += '<div class="sources"><strong>Sources:</strong><ul>'
                    for source in sources[:3]:  # Limit to first 3 sources
                        # Truncate long source excerpts to 200 chars.
                        content_preview = source.get('content', '')[:200] + '...' if len(source.get('content', '')) > 200 else source.get('content', '')
                        html_content += f'<li>{content_preview}</li>'
                    html_content += '</ul></div>'

                html_content += '</div>'

            html_content += f"""

            <div style="text-align: center; color: #666; margin-top: 40px; padding-top: 20px; border-top: 1px solid #ddd;">
                <p>Generated by GEO SEO AI Optimizer | {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
            </div>
            </body>
            </html>
            """

            return html_content

        except Exception as e:
            # Degrade to a minimal error page rather than raising.
            return f"<html><body><h1>Q&A HTML Export Error</h1><p>{str(e)}</p></body></html>"
+ def _export_qa_csv(self, data: Dict[str, Any]) -> str:
986
+ """Export Q&A results as CSV"""
987
+ try:
988
+ output = io.StringIO()
989
+ writer = csv.writer(output)
990
+
991
+ session = data.get('qa_session', {})
992
+ summary = data.get('summary', {})
993
+ interactions = session.get('interactions', [])
994
+
995
+ # Header
996
+ writer.writerow(['Q&A Session Report'])
997
+ writer.writerow(['Session Date:', session.get('session_date', 'Unknown')])
998
+ writer.writerow(['Total Questions:', session.get('total_questions', 0)])
999
+ writer.writerow(['Successful Answers:', summary.get('successful_answers', 0)])
1000
+ writer.writerow([])
1001
+
1002
+ # Q&A data
1003
+ writer.writerow(['Question Index', 'Question', 'Answer', 'Has Sources', 'Answer Length'])
1004
+
1005
+ for i, interaction in enumerate(interactions, 1):
1006
+ question = interaction.get('query', 'No question')
1007
+ answer = interaction.get('result', interaction.get('answer', 'No answer'))
1008
+ has_sources = 'Yes' if interaction.get('sources') else 'No'
1009
+ answer_length = len(answer) if answer else 0
1010
+
1011
+ writer.writerow([i, question, answer, has_sources, answer_length])
1012
+
1013
+ return output.getvalue()
1014
+
1015
+ except Exception as e:
1016
+ return f"Q&A CSV export error: {str(e)}"
1017
+
1018
    def _export_batch_excel(self, data: Dict[str, Any]) -> bytes:
        """Export batch results as an Excel workbook, returned as raw bytes.

        Sheets: 'Batch Metadata' and 'Batch Summary' (key/value dumps) and,
        when any results exist, 'Batch Results' with each nested result
        flattened into a single row. Falls back to UTF-8 error text plus a
        JSON dump of the payload when workbook creation fails.
        """
        try:
            output = io.BytesIO()

            with pd.ExcelWriter(output, engine='openpyxl') as writer:
                # Batch metadata sheet
                metadata = data.get('batch_metadata', {})
                metadata_df = pd.DataFrame([
                    {'Property': k, 'Value': v} for k, v in metadata.items()
                ])
                metadata_df.to_excel(writer, sheet_name='Batch Metadata', index=False)

                # Batch summary sheet
                summary = data.get('batch_summary', {})
                summary_df = pd.DataFrame([
                    {'Metric': k, 'Value': v} for k, v in summary.items()
                ])
                summary_df.to_excel(writer, sheet_name='Batch Summary', index=False)

                # Individual results sheet
                results = data.get('batch_results', [])
                if results:
                    # Flatten results for tabular format: nested dicts become
                    # underscore-joined columns (see _flatten_dict).
                    flattened_results = []
                    for i, result in enumerate(results):
                        flat_result = {'Batch_Index': i}
                        self._flatten_dict(result, flat_result)  # mutates flat_result in place
                        flattened_results.append(flat_result)

                    results_df = pd.DataFrame(flattened_results)
                    results_df.to_excel(writer, sheet_name='Batch Results', index=False)

            output.seek(0)
            return output.getvalue()

        except Exception as e:
            error_content = f"Batch Excel export failed: {str(e)}\n\nData:\n{json.dumps(data, indent=2)}"
            return error_content.encode('utf-8')
+ def _export_batch_csv(self, data: Dict[str, Any]) -> str:
1059
+ """Export batch results as CSV"""
1060
+ try:
1061
+ output = io.StringIO()
1062
+ writer = csv.writer(output)
1063
+
1064
+ # Batch metadata
1065
+ metadata = data.get('batch_metadata', {})
1066
+ writer.writerow(['Batch Analysis Results'])
1067
+ writer.writerow(['Export Timestamp:', data.get('export_timestamp', 'Unknown')])
1068
+ writer.writerow([])
1069
+
1070
+ writer.writerow(['Batch Metadata'])
1071
+ for key, value in metadata.items():
1072
+ writer.writerow([key, value])
1073
+
1074
+ writer.writerow([])
1075
+
1076
+ # Batch summary
1077
+ summary = data.get('batch_summary', {})
1078
+ writer.writerow(['Batch Summary'])
1079
+ for key, value in summary.items():
1080
+ writer.writerow([key, value])
1081
+
1082
+ writer.writerow([])
1083
+
1084
+ # Individual results (simplified)
1085
+ results = data.get('batch_results', [])
1086
+ if results:
1087
+ writer.writerow(['Individual Results'])
1088
+ writer.writerow(['Index', 'Status', 'Summary'])
1089
+
1090
+ for i, result in enumerate(results):
1091
+ status = 'Success' if not result.get('error') else 'Error'
1092
+ summary_text = str(result)[:100] + '...' if len(str(result)) > 100 else str(result)
1093
+ writer.writerow([i, status, summary_text])
1094
+
1095
+ return output.getvalue()
1096
+
1097
+ except Exception as e:
1098
+ return f"Batch CSV export error: {str(e)}"
1099
+
1100
+ def _export_geo_pdf(self, data: Dict[str, Any]) -> bytes:
1101
+ """Export GEO data as PDF (placeholder - would need reportlab)"""
1102
+ try:
1103
+ # For now, return HTML content as bytes
1104
+ # In a full implementation, you'd use reportlab or weasyprint
1105
+ html_content = self._export_geo_html(data)
1106
+ return html_content.encode('utf-8')
1107
+
1108
+ except Exception as e:
1109
+ error_content = f"PDF export not fully implemented. Error: {str(e)}"
1110
+ return error_content.encode('utf-8')
1111
+
1112
+ def _create_executive_summary(self, analysis_data: Dict[str, Any]) -> Dict[str, Any]:
1113
+ """Create executive summary report"""
1114
+ try:
1115
+ geo_results = analysis_data.get('geo_results', [])
1116
+ enhancement_results = analysis_data.get('enhancement_results', {})
1117
+ qa_results = analysis_data.get('qa_results', [])
1118
+
1119
+ # Calculate key metrics
1120
+ overall_performance = self._calculate_overall_performance(analysis_data)
1121
+
1122
+ return {
1123
+ 'executive_summary': {
1124
+ 'overall_performance_score': overall_performance,
1125
+ 'key_findings': self._extract_key_findings(analysis_data),
1126
+ 'priority_recommendations': self._get_priority_recommendations(analysis_data),
1127
+ 'roi_potential': self._estimate_roi_potential(overall_performance),
1128
+ 'implementation_timeline': self._suggest_implementation_timeline(analysis_data),
1129
+ 'resource_requirements': self._estimate_resource_requirements(analysis_data)
1130
+ }
1131
+ }
1132
+
1133
+ except Exception as e:
1134
+ return {'error': f"Executive summary creation failed: {str(e)}"}
1135
+
1136
+ def _create_summary_report(self, analysis_data: Dict[str, Any]) -> Dict[str, Any]:
1137
+ """Create summary report"""
1138
+ try:
1139
+ return {
1140
+ 'summary_report': {
1141
+ 'analysis_overview': self._create_analysis_overview(analysis_data),
1142
+ 'performance_metrics': self._summarize_performance_metrics(analysis_data),
1143
+ 'improvement_opportunities': self._identify_improvement_opportunities(analysis_data),
1144
+ 'competitive_position': self._assess_competitive_position(analysis_data),
1145
+ 'next_steps': self._recommend_next_steps(analysis_data)
1146
+ }
1147
+ }
1148
+
1149
+ except Exception as e:
1150
+ return {'error': f"Summary report creation failed: {str(e)}"}
1151
+
1152
+ def _create_full_report(self, analysis_data: Dict[str, Any]) -> Dict[str, Any]:
1153
+ """Create full detailed report"""
1154
+ try:
1155
+ return {
1156
+ 'full_report': {
1157
+ 'executive_summary': self._create_executive_summary(analysis_data).get('executive_summary', {}),
1158
+ 'detailed_analysis': {
1159
+ 'geo_analysis_details': analysis_data.get('geo_results', []),
1160
+ 'content_optimization_details': analysis_data.get('enhancement_results', {}),
1161
+ 'qa_performance_details': analysis_data.get('qa_results', [])
1162
+ },
1163
+ 'methodology': self._document_methodology(),
1164
+ 'data_sources': self._document_data_sources(analysis_data),
1165
+ 'limitations': self._document_limitations(),
1166
+ 'appendices': self._create_appendices(analysis_data)
1167
+ }
1168
+ }
1169
+
1170
+ except Exception as e:
1171
+ return {'error': f"Full report creation failed: {str(e)}"}
1172
+
1173
+ def _create_batch_summary(self, batch_results: List[Dict[str, Any]]) -> Dict[str, Any]:
1174
+ """Create summary of batch processing results"""
1175
+ try:
1176
+ total_items = len(batch_results)
1177
+ successful_items = len([r for r in batch_results if not r.get('error')])
1178
+ failed_items = total_items - successful_items
1179
+
1180
+ return {
1181
+ 'total_items': total_items,
1182
+ 'successful_items': successful_items,
1183
+ 'failed_items': failed_items,
1184
+ 'success_rate': (successful_items / total_items * 100) if total_items > 0 else 0,
1185
+ 'processing_status': 'Completed',
1186
+ 'average_processing_time': self._calculate_avg_processing_time(batch_results),
1187
+ 'common_errors': self._identify_common_errors(batch_results)
1188
+ }
1189
+
1190
+ except Exception as e:
1191
+ return {'error': f"Batch summary creation failed: {str(e)}"}
1192
+
1193
+ def _generate_performance_insights(self, scores: Dict[str, float], overall_avg: float) -> List[str]:
1194
+ """Generate performance insights from scores"""
1195
+ insights = []
1196
+
1197
+ try:
1198
+ # Overall performance insight
1199
+ if overall_avg >= 8.0:
1200
+ insights.append("Excellent overall GEO performance - content is well-optimized for AI search engines")
1201
+ elif overall_avg >= 6.0:
1202
+ insights.append("Good GEO performance with room for improvement in specific areas")
1203
+ elif overall_avg >= 4.0:
1204
+ insights.append("Moderate GEO performance - significant optimization opportunities exist")
1205
+ else:
1206
+ insights.append("Low GEO performance - comprehensive optimization needed")
1207
+
1208
+ # Specific metric insights
1209
+ for metric, score in scores.items():
1210
+ if score < 5.0:
1211
+ metric_name = metric.replace('_', ' ').title()
1212
+ insights.append(f"Low {metric_name} score ({score:.1f}) needs immediate attention")
1213
+ elif score >= 8.5:
1214
+ metric_name = metric.replace('_', ' ').title()
1215
+ insights.append(f"Excellent {metric_name} score ({score:.1f}) - maintain current approach")
1216
+
1217
+ return insights[:5] # Return top 5 insights
1218
+
1219
+ except Exception:
1220
+ return ["Unable to generate performance insights"]
1221
+
1222
+ def _generate_package_readme(self, analysis_data: Dict[str, Any]) -> str:
1223
+ """Generate README file for export package"""
1224
+ try:
1225
+ readme_content = f"""
1226
+ GEO SEO AI Optimizer - Analysis Package
1227
+ ======================================
1228
+
1229
+ Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
1230
+
1231
+ This package contains the complete analysis results from the GEO SEO AI Optimizer tool.
1232
+
1233
+ Files Included:
1234
+ - JSON file: Complete raw data in JSON format
1235
+ - HTML file: Visual report for web viewing
1236
+ - CSV file: Tabular data for spreadsheet analysis
1237
+ - README.txt: This file
1238
+
1239
+ About GEO (Generative Engine Optimization):
1240
+ GEO is the practice of optimizing content for AI-powered search engines and
1241
+ language models. Unlike traditional SEO, GEO focuses on:
1242
+
1243
+ - AI search visibility
1244
+ - Query intent matching
1245
+ - Conversational readiness
1246
+ - Citation worthiness
1247
+ - Semantic richness
1248
+ - Context completeness
1249
+
1250
+ How to Use These Files:
1251
+ 1. Open the HTML file in a web browser for a visual report
1252
+ 2. Import the CSV file into Excel or Google Sheets for analysis
1253
+ 3. Use the JSON file for programmatic processing or integration
1254
+
1255
+ For more information about GEO optimization, visit the tool documentation.
1256
+
1257
+ Generated by: GEO SEO AI Optimizer v1.0
1258
+ """
1259
+ return readme_content
1260
+
1261
+ except Exception as e:
1262
+ return f"README generation failed: {str(e)}"
1263
+
1264
+ # Helper methods for data processing and analysis
1265
+
1266
+ def _get_performance_level(self, score: float) -> str:
1267
+ """Get performance level description for a score"""
1268
+ if score >= 8.0:
1269
+ return "Excellent"
1270
+ elif score >= 6.0:
1271
+ return "Good"
1272
+ elif score >= 4.0:
1273
+ return "Fair"
1274
+ else:
1275
+ return "Needs Improvement"
1276
+
1277
+ def _categorize_recommendation(self, recommendation: str) -> str:
1278
+ """Categorize a recommendation based on content"""
1279
+ rec_lower = recommendation.lower()
1280
+
1281
+ if any(word in rec_lower for word in ['structure', 'heading', 'format']):
1282
+ return "Content Structure"
1283
+ elif any(word in rec_lower for word in ['keyword', 'semantic', 'topic']):
1284
+ return "SEO & Keywords"
1285
+ elif any(word in rec_lower for word in ['clarity', 'readability', 'language']):
1286
+ return "Content Quality"
1287
+ elif any(word in rec_lower for word in ['technical', 'schema', 'markup']):
1288
+ return "Technical SEO"
1289
+ else:
1290
+ return "General"
1291
+
1292
+ def _calculate_avg_response_length(self, qa_results: List[Dict[str, Any]]) -> float:
1293
+ """Calculate average response length for Q&A results"""
1294
+ try:
1295
+ response_lengths = []
1296
+ for result in qa_results:
1297
+ answer = result.get('result', result.get('answer', ''))
1298
+ if answer and not result.get('error'):
1299
+ response_lengths.append(len(answer))
1300
+
1301
+ return sum(response_lengths) / len(response_lengths) if response_lengths else 0
1302
+
1303
+ except Exception:
1304
+ return 0
1305
+
1306
+ def _extract_common_topics(self, qa_results: List[Dict[str, Any]]) -> List[str]:
1307
+ """Extract common topics from Q&A results"""
1308
+ try:
1309
+ # Simple topic extraction based on question keywords
1310
+ topics = {}
1311
+
1312
+ for result in qa_results:
1313
+ question = result.get('query', result.get('question', ''))
1314
+ if question:
1315
+ words = question.lower().split()
1316
+ for word in words:
1317
+ if len(word) > 4: # Focus on longer words
1318
+ topics[word] = topics.get(word, 0) + 1
1319
+
1320
+ # Return top 5 most common topics
1321
+ sorted_topics = sorted(topics.items(), key=lambda x: x[1], reverse=True)
1322
+ return [topic for topic, count in sorted_topics[:5]]
1323
+
1324
+ except Exception:
1325
+ return []
1326
+
1327
+ def _flatten_dict(self, d: Dict[str, Any], parent_dict: Dict[str, Any], parent_key: str = '') -> None:
1328
+ """Flatten nested dictionary for tabular export"""
1329
+ try:
1330
+ for key, value in d.items():
1331
+ new_key = f"{parent_key}_{key}" if parent_key else key
1332
+
1333
+ if isinstance(value, dict):
1334
+ self._flatten_dict(value, parent_dict, new_key)
1335
+ elif isinstance(value, list):
1336
+ parent_dict[new_key] = json.dumps(value) # Convert lists to JSON strings
1337
+ else:
1338
+ parent_dict[new_key] = value
1339
+
1340
+ except Exception:
1341
+ pass # Skip problematic keys
1342
+
1343
+ def _calculate_overall_performance(self, analysis_data: Dict[str, Any]) -> float:
1344
+ """Calculate overall performance score across all analyses"""
1345
+ try:
1346
+ scores = []
1347
+
1348
+ # GEO scores
1349
+ geo_results = analysis_data.get('geo_results', [])
1350
+ for result in geo_results:
1351
+ if 'geo_scores' in result:
1352
+ geo_score_values = list(result['geo_scores'].values())
1353
+ if geo_score_values:
1354
+ scores.append(sum(geo_score_values) / len(geo_score_values))
1355
+
1356
+ # Enhancement scores
1357
+ enhancement = analysis_data.get('enhancement_results', {})
1358
+ if 'scores' in enhancement:
1359
+ enh_scores = list(enhancement['scores'].values())
1360
+ if enh_scores:
1361
+ scores.append(sum(enh_scores) / len(enh_scores))
1362
+
1363
+ return sum(scores) / len(scores) if scores else 0
1364
+
1365
+ except Exception:
1366
+ return 0
1367
+
1368
+ def _extract_key_findings(self, analysis_data: Dict[str, Any]) -> List[str]:
1369
+ """Extract key findings from analysis data"""
1370
+ findings = []
1371
+
1372
+ try:
1373
+ # Add findings based on performance scores
1374
+ overall_perf = self._calculate_overall_performance(analysis_data)
1375
+
1376
+ if overall_perf >= 8.0:
1377
+ findings.append("Content demonstrates excellent AI search optimization")
1378
+ elif overall_perf <= 4.0:
1379
+ findings.append("Significant optimization opportunities identified")
1380
+
1381
+ # Add more specific findings based on data
1382
+ geo_results = analysis_data.get('geo_results', [])
1383
+ if geo_results:
1384
+ findings.append(f"Analyzed {len(geo_results)} pages for GEO performance")
1385
+
1386
+ enhancement = analysis_data.get('enhancement_results', {})
1387
+ if enhancement and 'keywords' in enhancement:
1388
+ findings.append(f"Identified {len(enhancement['keywords'])} key optimization terms")
1389
+
1390
+ return findings[:5] # Return top 5 findings
1391
+
1392
+ except Exception:
1393
+ return ["Unable to extract key findings"]
1394
+
1395
+ def _get_priority_recommendations(self, analysis_data: Dict[str, Any]) -> List[str]:
1396
+ """Get priority recommendations from analysis"""
1397
+ try:
1398
+ recommendations = []
1399
+
1400
+ # Collect all recommendations from different analyses
1401
+ geo_results = analysis_data.get('geo_results', [])
1402
+ for result in geo_results:
1403
+ recommendations.extend(result.get('recommendations', []))
1404
+
1405
+ # Remove duplicates and return top priorities
1406
+ unique_recs = list(set(recommendations))
1407
+ return unique_recs[:3] # Top 3 priority recommendations
1408
+
1409
+ except Exception:
1410
+ return ["Review and implement GEO best practices"]
1411
+
1412
+ def _estimate_roi_potential(self, performance_score: float) -> str:
1413
+ """Estimate ROI potential based on performance score"""
1414
+ if performance_score <= 4.0:
1415
+ return "High - Significant improvement potential"
1416
+ elif performance_score <= 6.0:
1417
+ return "Medium - Moderate improvement opportunities"
1418
+ else:
1419
+ return "Low - Already well-optimized"
1420
+
1421
+ def _suggest_implementation_timeline(self, analysis_data: Dict[str, Any]) -> str:
1422
+ """Suggest implementation timeline"""
1423
+ try:
1424
+ overall_perf = self._calculate_overall_performance(analysis_data)
1425
+
1426
+ if overall_perf <= 4.0:
1427
+ return "3-6 months for comprehensive optimization"
1428
+ elif overall_perf <= 6.0:
1429
+ return "1-3 months for targeted improvements"
1430
+ else:
1431
+ return "Ongoing maintenance and monitoring"
1432
+
1433
+ except Exception:
1434
+ return "Timeline assessment unavailable"
1435
+
1436
+ def _estimate_resource_requirements(self, analysis_data: Dict[str, Any]) -> Dict[str, str]:
1437
+ """Estimate resource requirements"""
1438
+ return {
1439
+ 'content_team': 'Required for content optimization',
1440
+ 'technical_team': 'Required for technical implementations',
1441
+ 'timeline': self._suggest_implementation_timeline(analysis_data),
1442
+ 'budget': 'Varies based on scope of optimizations'
1443
+ }
1444
+
1445
+ def _create_analysis_overview(self, analysis_data: Dict[str, Any]) -> Dict[str, Any]:
1446
+ """Create analysis overview"""
1447
+ try:
1448
+ return {
1449
+ 'analyses_performed': list(analysis_data.keys()),
1450
+ 'total_items_analyzed': sum(len(v) if isinstance(v, list) else 1 for v in analysis_data.values()),
1451
+ 'analysis_scope': 'Comprehensive GEO and content optimization analysis',
1452
+ 'key_focus_areas': ['AI Search Optimization', 'Content Enhancement', 'Performance Analysis']
1453
+ }
1454
+
1455
+ except Exception:
1456
+ return {'error': 'Overview creation failed'}
1457
+
1458
+ def _summarize_performance_metrics(self, analysis_data: Dict[str, Any]) -> Dict[str, float]:
1459
+ """Summarize performance metrics"""
1460
+ try:
1461
+ return {
1462
+ 'overall_performance': self._calculate_overall_performance(analysis_data),
1463
+ 'optimization_potential': 10 - self._calculate_overall_performance(analysis_data),
1464
+ 'completion_rate': 100.0 # Assuming analysis completed successfully
1465
+ }
1466
+
1467
+ except Exception:
1468
+ return {}
1469
+
1470
+ def _identify_improvement_opportunities(self, analysis_data: Dict[str, Any]) -> List[str]:
1471
+ """Identify improvement opportunities"""
1472
+ return self._get_priority_recommendations(analysis_data)
1473
+
1474
+ def _assess_competitive_position(self, analysis_data: Dict[str, Any]) -> str:
1475
+ """Assess competitive position"""
1476
+ try:
1477
+ overall_perf = self._calculate_overall_performance(analysis_data)
1478
+
1479
+ if overall_perf >= 8.0:
1480
+ return "Strong - Above average GEO performance"
1481
+ elif overall_perf >= 6.0:
1482
+ return "Competitive - Meeting industry standards"
1483
+ elif overall_perf >= 4.0:
1484
+ return "Below Average - Improvement needed"
1485
+ else:
1486
+ return "Weak - Significant optimization required"
1487
+
1488
+ except Exception:
1489
+ return "Assessment unavailable"
1490
+
1491
+ def _recommend_next_steps(self, analysis_data: Dict[str, Any]) -> List[str]:
1492
+ """Recommend next steps"""
1493
+ steps = [
1494
+ "Review detailed analysis results",
1495
+ "Prioritize recommendations by impact",
1496
+ "Develop implementation plan",
1497
+ "Monitor performance improvements"
1498
+ ]
1499
+
1500
+ # Add specific steps based on performance
1501
+ overall_perf = self._calculate_overall_performance(analysis_data)
1502
+ if overall_perf <= 4.0:
1503
+ steps.insert(1, "Focus on fundamental GEO optimization")
1504
+
1505
+ return steps
1506
+
1507
+ def _document_methodology(self) -> Dict[str, str]:
1508
+ """Document analysis methodology"""
1509
+ return {
1510
+ 'geo_analysis': 'AI-powered content analysis using specialized GEO metrics',
1511
+ 'content_optimization': 'LLM-based content enhancement and scoring',
1512
+ 'performance_scoring': 'Multi-dimensional scoring system for AI search optimization',
1513
+ 'data_collection': 'Automated content parsing and analysis',
1514
+ 'validation': 'Cross-referenced metrics and quality assurance checks'
1515
+ }
1516
+
1517
+ def _document_data_sources(self, analysis_data: Dict[str, Any]) -> List[str]:
1518
+ """Document data sources used in analysis"""
1519
+ sources = []
1520
+
1521
+ if 'geo_results' in analysis_data:
1522
+ sources.append("Website content analysis")
1523
+ if 'enhancement_results' in analysis_data:
1524
+ sources.append("Content optimization analysis")
1525
+ if 'qa_results' in analysis_data:
1526
+ sources.append("Document Q&A interactions")
1527
+
1528
+ sources.extend([
1529
+ "AI-powered content scoring",
1530
+ "GEO performance metrics",
1531
+ "Industry best practices database"
1532
+ ])
1533
+
1534
+ return sources
1535
+
1536
+ def _document_limitations(self) -> List[str]:
1537
+ """Document analysis limitations"""
1538
+ return [
1539
+ "Analysis based on current content snapshot",
1540
+ "Performance may vary with search engine algorithm updates",
1541
+ "Recommendations require human review for implementation",
1542
+ "Results depend on quality of input content",
1543
+ "AI model performance may vary across different content types"
1544
+ ]
1545
+
1546
+ def _create_appendices(self, analysis_data: Dict[str, Any]) -> Dict[str, Any]:
1547
+ """Create report appendices"""
1548
+ try:
1549
+ return {
1550
+ 'technical_details': {
1551
+ 'models_used': ['GPT-based content analysis', 'Semantic similarity scoring'],
1552
+ 'processing_time': 'Variable based on content volume',
1553
+ 'confidence_intervals': 'Scores provided with ±0.5 accuracy'
1554
+ },
1555
+ 'glossary': {
1556
+ 'GEO': 'Generative Engine Optimization - optimization for AI search engines',
1557
+ 'AI Search Visibility': 'Likelihood of content appearing in AI search results',
1558
+ 'Citation Worthiness': 'Probability of content being cited by AI systems',
1559
+ 'Conversational Readiness': 'Suitability for AI chat responses'
1560
+ },
1561
+ 'references': [
1562
+ 'GEO Best Practices Guide',
1563
+ 'AI Search Engine Optimization Standards',
1564
+ 'Content Performance Benchmarks'
1565
+ ]
1566
+ }
1567
+
1568
+ except Exception:
1569
+ return {}
1570
+
1571
+ def _calculate_avg_processing_time(self, batch_results: List[Dict[str, Any]]) -> float:
1572
+ """Calculate average processing time for batch results"""
1573
+ try:
1574
+ processing_times = []
1575
+
1576
+ for result in batch_results:
1577
+ if 'processing_time' in result:
1578
+ processing_times.append(result['processing_time'])
1579
+
1580
+ return sum(processing_times) / len(processing_times) if processing_times else 0
1581
+
1582
+ except Exception:
1583
+ return 0
1584
+
1585
+ def _identify_common_errors(self, batch_results: List[Dict[str, Any]]) -> List[str]:
1586
+ """Identify common errors in batch processing"""
1587
+ try:
1588
+ error_counts = {}
1589
+
1590
+ for result in batch_results:
1591
+ if result.get('error'):
1592
+ error_msg = str(result['error'])[:50] # First 50 chars
1593
+ error_counts[error_msg] = error_counts.get(error_msg, 0) + 1
1594
+
1595
+ # Return top 3 most common errors
1596
+ sorted_errors = sorted(error_counts.items(), key=lambda x: x[1], reverse=True)
1597
+ return [error for error, count in sorted_errors[:3]]
1598
+
1599
+ except Exception:
1600
+ return []
1601
+
1602
+
1603
class DataValidator:
    """Helper class for validating export data"""

    @staticmethod
    def validate_geo_data(geo_results: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Check a list of GEO results for required fields and sane scores.

        Returns a report dict with 'valid', 'errors' and 'warnings'.
        """
        report: Dict[str, Any] = {'valid': True, 'errors': [], 'warnings': []}
        try:
            if not geo_results:
                report['errors'].append("No GEO results provided")
                report['valid'] = False
                return report

            for index, entry in enumerate(geo_results):
                # Missing sections are tolerated but flagged.
                if 'geo_scores' not in entry:
                    report['warnings'].append(f"Result {index} missing geo_scores")
                if 'page_data' not in entry:
                    report['warnings'].append(f"Result {index} missing page_data")

                # Scores must be numeric and fall within the 0-10 scale.
                if 'geo_scores' in entry:
                    for metric, score in entry['geo_scores'].items():
                        in_range = isinstance(score, (int, float)) and 0 <= score <= 10
                        if not in_range:
                            report['errors'].append(f"Invalid score for {metric} in result {index}")
                            report['valid'] = False

            return report
        except Exception as e:
            report['errors'].append(f"Validation failed: {str(e)}")
            report['valid'] = False
            return report

    @staticmethod
    def validate_enhancement_data(enhancement_result: Dict[str, Any]) -> Dict[str, Any]:
        """Check an enhancement result for the expected score fields and types."""
        report: Dict[str, Any] = {'valid': True, 'errors': [], 'warnings': []}
        try:
            if 'scores' not in enhancement_result:
                report['warnings'].append("Enhancement result missing scores")
            else:
                scores = enhancement_result['scores']
                for required in ('clarity', 'structuredness', 'answerability'):
                    if required not in scores:
                        report['warnings'].append(f"Missing {required} score")
                    elif not isinstance(scores[required], (int, float)):
                        report['errors'].append(f"Invalid {required} score type")
                        report['valid'] = False

            return report
        except Exception as e:
            report['errors'].append(f"Enhancement validation failed: {str(e)}")
            report['valid'] = False
            return report
1675
+
1676
+
1677
class ExportManager:
    """High-level export management class.

    Wraps ResultExporter with validation (via DataValidator) and keeps an
    in-memory audit log of every export attempt.
    """

    def __init__(self):
        self.exporter = ResultExporter()
        self.validator = DataValidator()
        self.export_history = []  # One audit entry per export attempt

    def export_with_validation(self, data: Dict[str, Any], data_type: str,
                               format_type: str = 'json') -> Dict[str, Any]:
        """Validate *data* for *data_type*, then export it as *format_type*.

        Returns:
            Dict with 'success'; on success also 'data' and 'validation',
            on failure 'error' (plus 'validation' when validation failed).
        """
        try:
            # Validate data first
            if data_type == 'geo_analysis':
                validation = self.validator.validate_geo_data(data.get('geo_results', []))
            elif data_type == 'content_optimization':
                validation = self.validator.validate_enhancement_data(data)
            else:
                # Unknown types are exported as raw JSON without validation.
                validation = {'valid': True, 'errors': [], 'warnings': []}

            if not validation['valid']:
                return {
                    'success': False,
                    'error': 'Data validation failed',
                    'validation': validation
                }

            # Dispatch to the matching exporter entry point.
            if data_type == 'geo_analysis':
                result = self.exporter.export_geo_results(
                    data.get('geo_results', []),
                    data.get('website_url', 'unknown'),
                    format_type
                )
            elif data_type == 'content_optimization':
                result = self.exporter.export_enhancement_results(data, format_type)
            else:
                result = json.dumps(data, indent=2, ensure_ascii=False)

            # Log the successful export for auditing.
            self.export_history.append({
                'timestamp': datetime.now().isoformat(),
                'data_type': data_type,
                'format_type': format_type,
                'validation_warnings': validation.get('warnings', []),
                'success': True
            })

            return {
                'success': True,
                'data': result,
                'validation': validation
            }

        except Exception as e:
            # Log the failure as well so history reflects every attempt.
            self.export_history.append({
                'timestamp': datetime.now().isoformat(),
                'data_type': data_type,
                'format_type': format_type,
                'success': False,
                'error': str(e)
            })
            return {
                'success': False,
                'error': f"Export failed: {str(e)}"
            }

    def get_export_history(self) -> List[Dict[str, Any]]:
        """Get export history"""
        return self.export_history

    def clear_export_history(self) -> None:
        """Clear export history"""
        self.export_history.clear()

    def get_supported_formats(self) -> Dict[str, List[str]]:
        """Get supported export formats by data type"""
        return {
            'geo_analysis': ['json', 'csv', 'html', 'xlsx', 'pdf'],
            'content_optimization': ['json', 'html', 'csv'],
            'qa_results': ['json', 'html', 'csv'],
            'batch_analysis': ['json', 'xlsx', 'csv']
        }

    def create_multi_format_export(self, data: Dict[str, Any], data_type: str,
                                   formats: List[str] = None) -> Dict[str, Any]:
        """Create export in multiple formats.

        BUG FIX: 'successful_formats' previously used ``'error' not in data``,
        which performs a *substring* search when the export payload is a
        string — any successful export whose text contained the word "error"
        was misreported as failed.  Failures are now detected structurally:
        only the ``{'error': ...}`` dicts produced below count as failures.
        """
        if formats is None:
            formats = ['json', 'html', 'csv']

        results = {}
        for format_type in formats:
            try:
                export_result = self.export_with_validation(data, data_type, format_type)
                if export_result['success']:
                    results[format_type] = export_result['data']
                else:
                    results[format_type] = {'error': export_result['error']}
            except Exception as e:
                results[format_type] = {'error': str(e)}

        successful = [
            fmt for fmt, payload in results.items()
            if not (isinstance(payload, dict) and 'error' in payload)
        ]
        return {
            'multi_format_export': results,
            'formats_generated': list(results.keys()),
            'successful_formats': successful
        }
+ }
1786
+
1787
+
1788
+ # Utility functions for the export module
1789
+
1790
def create_export_template(data_type: str) -> Dict[str, Any]:
    """Create export template for different data types.

    Returns a sample payload for 'geo_analysis', 'content_optimization' or
    'qa_results'; unknown types yield an empty dict.
    """
    geo_template = {
        'website_url': 'https://example.com',
        'geo_results': [
            {
                'page_data': {
                    'url': 'https://example.com/page1',
                    'title': 'Example Page',
                    'word_count': 500
                },
                'geo_scores': {
                    'ai_search_visibility': 7.5,
                    'query_intent_matching': 6.8,
                    'conversational_readiness': 8.2,
                    'citation_worthiness': 7.1
                },
                'recommendations': [
                    'Improve content structure',
                    'Add more specific examples'
                ]
            }
        ]
    }

    optimization_template = {
        'scores': {
            'clarity': 7.5,
            'structuredness': 6.8,
            'answerability': 8.2
        },
        'keywords': ['example', 'optimization', 'content'],
        'optimized_text': 'This is the optimized version of the content...',
        'optimization_suggestions': [
            'Improve sentence structure',
            'Add more specific keywords'
        ]
    }

    qa_template = [
        {
            'query': 'What is the main topic?',
            'result': 'The main topic is content optimization for AI systems.',
            'sources': [
                {
                    'content': 'Source document content...',
                    'metadata': {'source': 'document1.pdf'}
                }
            ]
        }
    ]

    templates = {
        'geo_analysis': geo_template,
        'content_optimization': optimization_template,
        'qa_results': qa_template,
    }
    return templates.get(data_type, {})
1843
+
1844
+
1845
def export_demo_data() -> Dict[str, Any]:
    """Export demonstration data for testing.

    Builds one "_demo" entry per known template type.
    """
    template_names = ('geo_analysis', 'content_optimization', 'qa_results')
    return {
        f"{name}_demo": create_export_template(name)
        for name in template_names
    }
1854
+
1855
+
1856
+ # Export the main classes and functions
1857
# Public API: `from <module> import *` exposes exactly these names.
__all__ = [
    'ResultExporter',
    'GEOReport',
    'ContentAnalysis',
    'DataValidator',
    'ExportManager',
    'create_export_template',
    'export_demo_data'
]
1866
+
1867
+
1868
+ # Example usage for testing
1869
if __name__ == "__main__":
    # Smoke-test the exporter end-to-end against the bundled demo template.
    exporter = ResultExporter()
    demo_geo_data = create_export_template('geo_analysis')

    def _preview(payload):
        """Truncate long payloads so console output stays readable."""
        return payload[:200] + "..." if len(str(payload)) > 200 else payload

    # Export the same demo data in two formats.
    json_export = exporter.export_geo_results(
        demo_geo_data['geo_results'],
        demo_geo_data['website_url'],
        'json'
    )
    html_export = exporter.export_geo_results(
        demo_geo_data['geo_results'],
        demo_geo_data['website_url'],
        'html'
    )

    print("JSON Export:", _preview(json_export))
    print("\nHTML Export:", _preview(html_export))

    # Also exercise the enhancement exporter.
    demo_enhancement = create_export_template('content_optimization')
    enhancement_export = exporter.export_enhancement_results(demo_enhancement, 'json')

    print("\nEnhancement Export:", _preview(enhancement_export))
utils/optimizer.py ADDED
@@ -0,0 +1,558 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Content Optimization Module
3
+ Enhances content for better AI/LLM performance and GEO scores
4
+ """
5
+
6
+ import json
7
+ import re
8
+ from typing import Dict, Any, List, Optional
9
+ from langchain.prompts import ChatPromptTemplate
10
+
11
+
12
+ class ContentOptimizer:
13
+ """Main class for optimizing content for AI search engines"""
14
+
15
    def __init__(self, llm):
        """Bind the optimizer to *llm* and build all prompt templates.

        Args:
            llm: LangChain-compatible chat model used for every optimization
                call (it is piped after a ChatPromptTemplate in each method).
        """
        self.llm = llm
        self.setup_prompts()
18
+
19
    def setup_prompts(self):
        """Initialize optimization prompts.

        Defines three system-prompt strings used by the optimization methods:
        - enhancement_prompt: general scoring + rewrite (standard mode)
        - seo_style_prompt: AI-search SEO analysis and rewrite (seo mode)
        - competitive_analysis_prompt: gap analysis; has a ``{content}``
          placeholder filled via ``str.format`` before use (competitive mode)

        NOTE(review): the first two prompts contain literal JSON braces;
        ChatPromptTemplate may treat '{'/'}' as template placeholders —
        confirm the prompts render without a KeyError at runtime.
        """

        # Main content enhancement prompt (expects a JSON reply with scores,
        # keywords and the optimized text).
        self.enhancement_prompt = """You are an AI Content Enhancement Specialist. Your purpose is to optimize user-provided text to maximize its effectiveness for large language models (LLMs) in search, question-answering, and conversational AI systems.

Evaluate the input text based on the following criteria, assigning a score from 1–10 for each:

Clarity: How easily can the content be understood?
Structuredness: How well-organized and coherent is the content?
LLM Answerability: How easily can an LLM extract precise answers from the content?

Identify the most salient keywords.

Rewrite the text to improve:
- Clarity and precision
- Logical structure and flow
- Suitability for LLM-based information retrieval

Present your analysis and optimized text in the following JSON format:

```json
{
"scores": {
"clarity": 8.5,
"structuredness": 7.0,
"answerability": 9.0
},
"keywords": ["example", "installation", "setup"],
"optimized_text": "..."
}
```"""

        # SEO-style optimization prompt (expects a JSON reply with seo_analysis,
        # optimized_content and improvement_summary sections).
        self.seo_style_prompt = """You are an AI-first SEO specialist. Optimize this content for AI search engines and LLM systems.

Focus on:
1. Semantic keyword optimization
2. Question-answer format enhancement
3. Factual accuracy and authority signals
4. Conversational readiness
5. Citation-worthy structure

Provide analysis and optimization in JSON:

```json
{
"seo_analysis": {
"keyword_density": "analysis of current keywords",
"semantic_gaps": ["missing semantic terms"],
"readability_score": 8.5,
"authority_signals": ["credentials", "citations"]
},
"optimized_content": {
"title_suggestions": ["optimized title 1", "optimized title 2"],
"meta_description": "AI-optimized meta description",
"enhanced_content": "full optimized content...",
"structured_data_suggestions": ["schema markup recommendations"]
},
"improvement_summary": {
"changes_made": ["change 1", "change 2"],
"expected_impact": "description of expected improvements"
}
}
```"""

        # Competitive content analysis prompt ({content} filled via .format()).
        self.competitive_analysis_prompt = """Compare this content against best practices for AI search optimization. Identify gaps and opportunities.

Original Content: {content}

Analyze against these AI search factors:
- Entity recognition and linking
- Question coverage completeness
- Factual statement clarity
- Conversational flow
- Semantic relationship mapping

Provide competitive analysis in JSON format with specific recommendations."""
98
+
99
+ def optimize_content(self, content: str, analyze_only: bool = False,
100
+ include_keywords: bool = True, optimization_type: str = "standard") -> Dict[str, Any]:
101
+ """
102
+ Main content optimization function
103
+
104
+ Args:
105
+ content (str): Content to optimize
106
+ analyze_only (bool): If True, only analyze without rewriting
107
+ include_keywords (bool): Whether to include keyword analysis
108
+ optimization_type (str): Type of optimization ("standard", "seo", "competitive")
109
+
110
+ Returns:
111
+ Dict: Optimization results with scores and enhanced content
112
+ """
113
+ try:
114
+ # Choose optimization approach
115
+ if optimization_type == "seo":
116
+ return self._seo_style_optimization(content, analyze_only)
117
+ elif optimization_type == "competitive":
118
+ return self._competitive_optimization(content)
119
+ else:
120
+ return self._standard_optimization(content, analyze_only, include_keywords)
121
+
122
+ except Exception as e:
123
+ return {'error': f"Optimization failed: {str(e)}"}
124
+
125
+ def _standard_optimization(self, content: str, analyze_only: bool, include_keywords: bool) -> Dict[str, Any]:
126
+ """Standard content optimization using enhancement prompt"""
127
+ try:
128
+ # Modify prompt based on options
129
+ prompt_text = self.enhancement_prompt
130
+
131
+ if analyze_only:
132
+ prompt_text = prompt_text.replace(
133
+ "Rewrite the text to improve:",
134
+ "Analyze the text for potential improvements in:"
135
+ ).replace(
136
+ '"optimized_text": "..."',
137
+ '"optimization_suggestions": ["suggestion 1", "suggestion 2"]'
138
+ )
139
+
140
+ if not include_keywords:
141
+ prompt_text = prompt_text.replace(
142
+ '"keywords": ["example", "installation", "setup"],',
143
+ ''
144
+ )
145
+
146
+ # Create and run chain
147
+ prompt_template = ChatPromptTemplate.from_messages([
148
+ ("system", prompt_text),
149
+ ("user", content[:6000]) # Limit content length
150
+ ])
151
+
152
+ chain = prompt_template | self.llm
153
+ result = chain.invoke({})
154
+
155
+ # Parse result
156
+ result_content = result.content if hasattr(result, 'content') else str(result)
157
+ parsed_result = self._parse_optimization_result(result_content)
158
+
159
+ # Add metadata
160
+ parsed_result.update({
161
+ 'optimization_type': 'standard',
162
+ 'analyze_only': analyze_only,
163
+ 'original_length': len(content),
164
+ 'original_word_count': len(content.split())
165
+ })
166
+
167
+ return parsed_result
168
+
169
+ except Exception as e:
170
+ return {'error': f"Standard optimization failed: {str(e)}"}
171
+
172
+ def _seo_style_optimization(self, content: str, analyze_only: bool) -> Dict[str, Any]:
173
+ """SEO-focused optimization for AI search engines"""
174
+ try:
175
+ prompt_template = ChatPromptTemplate.from_messages([
176
+ ("system", self.seo_style_prompt),
177
+ ("user", f"Optimize this content for AI search engines:\n\n{content[:6000]}")
178
+ ])
179
+
180
+ chain = prompt_template | self.llm
181
+ result = chain.invoke({})
182
+
183
+ result_content = result.content if hasattr(result, 'content') else str(result)
184
+ parsed_result = self._parse_optimization_result(result_content)
185
+
186
+ # Add SEO-specific metadata
187
+ parsed_result.update({
188
+ 'optimization_type': 'seo',
189
+ 'analyze_only': analyze_only,
190
+ 'seo_focused': True
191
+ })
192
+
193
+ return parsed_result
194
+
195
+ except Exception as e:
196
+ return {'error': f"SEO optimization failed: {str(e)}"}
197
+
198
+ def _competitive_optimization(self, content: str) -> Dict[str, Any]:
199
+ """Competitive analysis-based optimization"""
200
+ try:
201
+ formatted_prompt = self.competitive_analysis_prompt.format(content=content[:5000])
202
+
203
+ prompt_template = ChatPromptTemplate.from_messages([
204
+ ("system", formatted_prompt),
205
+ ("user", "Perform the competitive analysis and provide optimization recommendations.")
206
+ ])
207
+
208
+ chain = prompt_template | self.llm
209
+ result = chain.invoke({})
210
+
211
+ result_content = result.content if hasattr(result, 'content') else str(result)
212
+ parsed_result = self._parse_optimization_result(result_content)
213
+
214
+ parsed_result.update({
215
+ 'optimization_type': 'competitive',
216
+ 'competitive_analysis': True
217
+ })
218
+
219
+ return parsed_result
220
+
221
+ except Exception as e:
222
+ return {'error': f"Competitive optimization failed: {str(e)}"}
223
+
224
+ def batch_optimize_content(self, content_list: List[str], optimization_type: str = "standard") -> List[Dict[str, Any]]:
225
+ """
226
+ Optimize multiple pieces of content in batch
227
+
228
+ Args:
229
+ content_list (List[str]): List of content pieces to optimize
230
+ optimization_type (str): Type of optimization to apply
231
+
232
+ Returns:
233
+ List[Dict]: List of optimization results
234
+ """
235
+ results = []
236
+
237
+ for i, content in enumerate(content_list):
238
+ try:
239
+ result = self.optimize_content(
240
+ content,
241
+ optimization_type=optimization_type
242
+ )
243
+ result['batch_index'] = i
244
+ results.append(result)
245
+
246
+ except Exception as e:
247
+ results.append({
248
+ 'batch_index': i,
249
+ 'error': f"Batch optimization failed: {str(e)}"
250
+ })
251
+
252
+ return results
253
+
254
def generate_content_variations(self, content: str, num_variations: int = 3) -> List[Dict[str, Any]]:
    """
    Generate multiple optimized variations of the same content.

    Each variation targets a different consumption style (conversational,
    authoritative, structured); the count is capped by the number of
    built-in variation prompts (currently 3).

    Args:
        content (str): Original content (only the first 4000 characters
            are forwarded to the LLM)
        num_variations (int): Number of variations to generate

    Returns:
        List[Dict]: List of content variations with analysis; a failed
        variation is reported as ``{'variation_index': i, 'error': ...}``
    """
    variations = []

    # One instruction per supported variation style; the list length
    # caps how many variations can actually be produced.
    variation_prompts = [
        "Create a more conversational version optimized for AI chat responses",
        "Create a more authoritative version optimized for citations",
        "Create a more structured version optimized for question-answering"
    ]

    for i in range(min(num_variations, len(variation_prompts))):
        try:
            # f-string: {{...}} renders as literal braces in the JSON
            # example. NOTE(review): the rendered prompt still contains
            # single braces, which ChatPromptTemplate may interpret as
            # template variables on invoke — confirm against the
            # langchain version in use.
            custom_prompt = f"""You are optimizing content for AI systems. {variation_prompts[i]}.

Original content: {content[:4000]}

Provide the optimized variation in JSON format:
```json
{{
    "variation_type": "conversational/authoritative/structured",
    "optimized_content": "the rewritten content...",
    "key_changes": ["change 1", "change 2"],
    "target_use_case": "description of ideal use case"
}}
```"""

            prompt_template = ChatPromptTemplate.from_messages([
                ("system", custom_prompt),
                ("user", "Generate the variation.")
            ])

            chain = prompt_template | self.llm
            result = chain.invoke({})

            # Chat models return a message object with .content; fall
            # back to str() for plain-string outputs.
            result_content = result.content if hasattr(result, 'content') else str(result)
            parsed_result = self._parse_optimization_result(result_content)

            parsed_result.update({
                'variation_index': i,
                'variation_prompt': variation_prompts[i]
            })

            variations.append(parsed_result)

        except Exception as e:
            variations.append({
                'variation_index': i,
                'error': f"Variation generation failed: {str(e)}"
            })

    return variations
314
+
315
def analyze_content_readability(self, content: str) -> Dict[str, Any]:
    """
    Analyze content readability for AI systems.

    Computes surface metrics (word/sentence/paragraph counts and
    averages), complexity indicators (long sentences, long words), an
    AI-readability score, and improvement recommendations.

    Args:
        content (str): Content to analyze

    Returns:
        Dict: Readability analysis results, or ``{'error': ...}`` when
        the analysis fails.
    """
    try:
        words = content.split()
        # Sentences are split on terminal punctuation; empty fragments
        # (e.g. after a trailing period) are discarded.
        sentences = [s.strip() for s in re.split(r'[.!?]+', content) if s.strip()]
        paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]

        word_total = len(words)
        sentence_total = len(sentences)
        paragraph_total = len(paragraphs)

        avg_words_per_sentence = word_total / sentence_total if sentence_total else 0
        avg_sentences_per_paragraph = sentence_total / paragraph_total if paragraph_total else 0
        avg_word_length = sum(len(word) for word in words) / word_total if word_total else 0

        # Complexity indicators: sentences over 20 words and words over
        # 6 characters are treated as "hard" for AI consumption.
        long_sentence_total = sum(1 for s in sentences if len(s.split()) > 20)
        complex_word_total = sum(1 for w in words if len(w) > 6)

        long_sentence_pct = long_sentence_total / sentence_total * 100 if sentence_total else 0
        complex_word_pct = complex_word_total / word_total * 100 if word_total else 0

        return {
            'basic_metrics': {
                'total_words': word_total,
                'total_sentences': sentence_total,
                'total_paragraphs': paragraph_total,
                'avg_words_per_sentence': avg_words_per_sentence,
                'avg_sentences_per_paragraph': avg_sentences_per_paragraph,
                'avg_word_length': avg_word_length
            },
            'complexity_indicators': {
                'long_sentences_count': long_sentence_total,
                'long_sentences_percentage': long_sentence_pct,
                'complex_words_count': complex_word_total,
                'complex_words_percentage': complex_word_pct
            },
            'ai_readability_score': self._calculate_ai_readability_score({
                'avg_words_per_sentence': avg_words_per_sentence,
                'avg_word_length': avg_word_length,
                'complex_words_percentage': complex_word_pct
            }),
            'recommendations': self._generate_readability_recommendations({
                'avg_words_per_sentence': avg_words_per_sentence,
                'long_sentences_percentage': long_sentence_pct,
                'complex_words_percentage': complex_word_pct
            })
        }

    except Exception as e:
        return {'error': f"Readability analysis failed: {str(e)}"}
373
+
374
def extract_key_entities(self, content: str) -> Dict[str, Any]:
    """
    Extract key entities and topics for optimization.

    Args:
        content (str): Content to analyze (only the first 5000
            characters are forwarded to the LLM)

    Returns:
        Dict: Extracted entities and topics as parsed by
        ``_parse_optimization_result``, or ``{'error': ...}`` on failure.
    """
    try:
        # The double braces ({{ }}) keep the JSON example literal when
        # str.format substitutes {content} below. NOTE(review): after
        # .format the string holds single braces, which
        # ChatPromptTemplate may treat as template variables on invoke —
        # confirm against the langchain version in use.
        entity_prompt = """Extract key entities, topics, and concepts from this content for AI optimization.

Content: {content}

Identify:
1. Named entities (people, places, organizations)
2. Key concepts and topics
3. Technical terms and jargon
4. Potential semantic keywords
5. Question-answer opportunities

Format as JSON:
```json
{{
    "named_entities": ["entity1", "entity2"],
    "key_topics": ["topic1", "topic2"],
    "technical_terms": ["term1", "term2"],
    "semantic_keywords": ["keyword1", "keyword2"],
    "question_opportunities": ["What is...", "How does..."],
    "entity_relationships": ["relationship descriptions"]
}}
```"""

        # Truncate to keep the prompt within a safe context budget.
        prompt_template = ChatPromptTemplate.from_messages([
            ("system", entity_prompt.format(content=content[:5000])),
            ("user", "Extract the entities and topics.")
        ])

        chain = prompt_template | self.llm
        result = chain.invoke({})

        # Chat models return a message object with .content; fall back
        # to str() for plain-string outputs.
        result_content = result.content if hasattr(result, 'content') else str(result)
        return self._parse_optimization_result(result_content)

    except Exception as e:
        return {'error': f"Entity extraction failed: {str(e)}"}
421
+
422
def optimize_for_voice_search(self, content: str) -> Dict[str, Any]:
    """
    Optimize content specifically for voice search and conversational AI.

    Args:
        content (str): Content to optimize (only the first 4000
            characters are forwarded to the LLM)

    Returns:
        Dict: Voice search optimization results tagged with
        ``optimization_type='voice_search'``, or ``{'error': ...}`` on
        failure.
    """
    try:
        # Double braces ({{ }}) keep the JSON example literal when
        # str.format substitutes {content}. NOTE(review): the formatted
        # string still contains single braces, which ChatPromptTemplate
        # may interpret as template variables on invoke — verify.
        voice_prompt = """Optimize this content for voice search and conversational AI systems.

Focus on:
1. Natural language patterns
2. Question-based structure
3. Conversational tone
4. Clear, direct answers
5. Featured snippet optimization

Original content: {content}

Provide optimization in JSON:
```json
{{
    "voice_optimized_content": "conversational version...",
    "question_answer_pairs": [
        {{"question": "What is...", "answer": "Direct answer..."}},
        {{"question": "How does...", "answer": "Step by step..."}}
    ],
    "featured_snippet_candidates": ["snippet 1", "snippet 2"],
    "natural_language_improvements": ["improvement 1", "improvement 2"],
    "conversational_score": 8.5
}}
```"""

        prompt_template = ChatPromptTemplate.from_messages([
            ("system", voice_prompt.format(content=content[:4000])),
            ("user", "Optimize for voice search.")
        ])

        chain = prompt_template | self.llm
        result = chain.invoke({})

        # Chat models return a message object with .content; fall back
        # to str() for plain-string outputs.
        result_content = result.content if hasattr(result, 'content') else str(result)
        parsed_result = self._parse_optimization_result(result_content)

        # Tag the result so callers can distinguish optimization modes.
        parsed_result.update({
            'optimization_type': 'voice_search',
            'voice_optimized': True
        })

        return parsed_result

    except Exception as e:
        return {'error': f"Voice search optimization failed: {str(e)}"}
478
+
479
+ def _parse_optimization_result(self, response_text: str) -> Dict[str, Any]:
480
+ """Parse LLM response and extract structured results"""
481
+ try:
482
+ # Find JSON content in the response
483
+ json_start = response_text.find('{')
484
+ json_end = response_text.rfind('}') + 1
485
+
486
+ if json_start != -1 and json_end != -1:
487
+ json_str = response_text[json_start:json_end]
488
+ parsed = json.loads(json_str)
489
+
490
+ # Ensure consistent structure
491
+ if 'scores' not in parsed and 'score' in parsed:
492
+ parsed['scores'] = parsed['score']
493
+
494
+ return parsed
495
+ else:
496
+ # If no JSON found, return raw response with error flag
497
+ return {
498
+ 'raw_response': response_text,
499
+ 'parsing_error': 'No JSON structure found in response',
500
+ 'scores': {'clarity': 0, 'structuredness': 0, 'answerability': 0}
501
+ }
502
+
503
+ except json.JSONDecodeError as e:
504
+ return {
505
+ 'raw_response': response_text,
506
+ 'parsing_error': f'JSON decode error: {str(e)}',
507
+ 'scores': {'clarity': 0, 'structuredness': 0, 'answerability': 0}
508
+ }
509
+ except Exception as e:
510
+ return {
511
+ 'raw_response': response_text,
512
+ 'parsing_error': f'Unexpected parsing error: {str(e)}',
513
+ 'scores': {'clarity': 0, 'structuredness': 0, 'answerability': 0}
514
+ }
515
+
516
+ def _calculate_ai_readability_score(self, metrics: Dict[str, float]) -> float:
517
+ """Calculate AI-specific readability score"""
518
+ try:
519
+ # Optimal ranges for AI consumption
520
+ optimal_words_per_sentence = 15 # Sweet spot for AI processing
521
+ optimal_word_length = 5 # Balance of complexity and clarity
522
+ optimal_complex_words_percentage = 15 # Some complexity is good for authority
523
+
524
+ # Calculate deviations from optimal
525
+ sentence_score = max(0, 10 - abs(metrics['avg_words_per_sentence'] - optimal_words_per_sentence) * 0.5)
526
+ word_length_score = max(0, 10 - abs(metrics['avg_word_length'] - optimal_word_length) * 2)
527
+ complexity_score = max(0, 10 - abs(metrics['complex_words_percentage'] - optimal_complex_words_percentage) * 0.3)
528
+
529
+ # Weighted average
530
+ overall_score = (sentence_score * 0.4 + word_length_score * 0.3 + complexity_score * 0.3)
531
+
532
+ return round(overall_score, 1)
533
+
534
+ except Exception:
535
+ return 5.0 # Default neutral score
536
+
537
+ def _generate_readability_recommendations(self, metrics: Dict[str, float]) -> List[str]:
538
+ """Generate specific readability improvement recommendations"""
539
+ recommendations = []
540
+
541
+ try:
542
+ if metrics['avg_words_per_sentence'] > 20:
543
+ recommendations.append("Break down long sentences for better AI processing")
544
+ elif metrics['avg_words_per_sentence'] < 8:
545
+ recommendations.append("Consider combining very short sentences for better context")
546
+
547
+ if metrics['long_sentences_percentage'] > 30:
548
+ recommendations.append("Reduce the number of complex sentences (>20 words)")
549
+
550
+ if metrics['complex_words_percentage'] > 25:
551
+ recommendations.append("Simplify vocabulary where possible for broader accessibility")
552
+ elif metrics['complex_words_percentage'] < 5:
553
+ recommendations.append("Add more specific terminology to establish authority")
554
+
555
+ return recommendations
556
+
557
+ except Exception:
558
+ return ["Unable to generate specific recommendations"]
utils/parser.py ADDED
@@ -0,0 +1,549 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Content Parsing Module
3
+ Handles extraction of content from PDFs, text, and webpages
4
+ """
5
+
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+ from urllib.parse import urljoin, urlparse
9
+ from typing import List, Dict, Any
10
+ import time
11
+ from langchain_community.document_loaders import PyPDFLoader
12
+ from langchain.schema import Document
13
+
14
+
15
class BaseParser:
    """Abstract parent for the content parsers in this module.

    Concrete subclasses populate ``supported_formats`` and override
    ``parse``; ``validate_source`` is an optional hook that defaults to
    accepting every source.
    """

    def __init__(self):
        # Subclasses list the extensions/schemes they handle here.
        self.supported_formats = []

    def parse(self, source: str) -> List[Document]:
        """Parse content from *source* into LangChain Documents."""
        raise NotImplementedError("Subclasses must implement parse method")

    def validate_source(self, source: str) -> bool:
        """Return True when *source* looks processable (default: always)."""
        return True
28
+
29
+
30
class PDFParser(BaseParser):
    """Parser for PDF documents, backed by LangChain's ``PyPDFLoader``."""

    def __init__(self):
        super().__init__()
        self.supported_formats = ['.pdf']

    def parse(self, pdf_path: str) -> List[Document]:
        """
        Parse PDF file and return list of Document objects.

        Args:
            pdf_path (str): Path to the PDF file

        Returns:
            List[Document]: List of parsed documents with metadata

        Raises:
            Exception: wrapping any loader failure.
        """
        try:
            pages = PyPDFLoader(pdf_path).load_and_split()
            page_total = len(pages)

            # Tag every page with provenance so downstream consumers can
            # trace a chunk back to its position in the PDF.
            for page_number, page in enumerate(pages, start=1):
                page.metadata.update({
                    'source_type': 'pdf',
                    'page_number': page_number,
                    'total_pages': page_total,
                    'parser': 'PDFParser'
                })

            return pages

        except Exception as e:
            raise Exception(f"Error parsing PDF: {str(e)}")

    def get_pdf_metadata(self, pdf_path: str) -> Dict[str, Any]:
        """Extract summary metadata (page/word counts) from a PDF file."""
        try:
            pages = PyPDFLoader(pdf_path).load()

            page_total = len(pages)
            word_total = sum(len(page.page_content.split()) for page in pages)

            return {
                'total_pages': page_total,
                'total_words': word_total,
                'average_words_per_page': word_total / page_total if page_total > 0 else 0,
                'file_type': 'PDF',
                'parser_used': 'PyPDFLoader'
            }

        except Exception as e:
            return {'error': f"Could not extract metadata: {str(e)}"}
84
+
85
+
86
class TextParser(BaseParser):
    """Parser for plain text content.

    Short texts become a single Document; long texts are chunked on
    sentence boundaries. Also provides lightweight structural analysis.
    """

    def __init__(self):
        super().__init__()
        self.supported_formats = ['.txt', 'plain_text']
        self.chunk_size = 1000  # Default chunk size (characters) for long texts

    def parse(self, text_content: str, chunk_size: int = None) -> List[Document]:
        """
        Parse text content and return list of Document objects.

        Args:
            text_content (str): Raw text content
            chunk_size (int): Optional chunk size for splitting long texts
                (defaults to ``self.chunk_size``)

        Returns:
            List[Document]: List of documents, potentially chunked

        Raises:
            Exception: wrapping empty input or any processing failure.
        """
        try:
            if not text_content.strip():
                raise ValueError("Empty text content provided")

            chunk_size = chunk_size or self.chunk_size

            # Short text: return a single document, no chunking overhead.
            if len(text_content) <= chunk_size:
                return [Document(
                    page_content=text_content,
                    metadata={
                        'source_type': 'text',
                        'word_count': len(text_content.split()),
                        'char_count': len(text_content),
                        'chunk_index': 0,
                        'total_chunks': 1,
                        'parser': 'TextParser'
                    }
                )]

            # Split long text into sentence-aligned chunks.
            chunks = self._split_text_into_chunks(text_content, chunk_size)
            documents = []

            for i, chunk in enumerate(chunks):
                documents.append(Document(
                    page_content=chunk,
                    metadata={
                        'source_type': 'text',
                        'word_count': len(chunk.split()),
                        'char_count': len(chunk),
                        'chunk_index': i,
                        'total_chunks': len(chunks),
                        'parser': 'TextParser'
                    }
                ))

            return documents

        except Exception as e:
            raise Exception(f"Error parsing text: {str(e)}")

    def _split_text_into_chunks(self, text: str, chunk_size: int) -> List[str]:
        """Split text into chunks while preserving sentence boundaries."""
        sentences = text.split('. ')
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            # Add the sentence to the current chunk only if it still fits.
            test_chunk = current_chunk + sentence + ". "

            if len(test_chunk) <= chunk_size:
                current_chunk = test_chunk
            else:
                # Flush the current chunk and start a new one.
                if current_chunk.strip():
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + ". "

        # Flush the trailing chunk if it has content.
        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        return chunks

    def analyze_text_structure(self, text_content: str) -> Dict[str, Any]:
        """Analyze the structure and characteristics of text content."""
        try:
            lines = text_content.split('\n')
            words = text_content.split()
            # BUG FIX: the original reported the count of non-empty
            # sentences but divided the averages by the raw split('.')
            # list, which includes empty fragments (e.g. after a trailing
            # period) — so the averages disagreed with the counts. Filter
            # once and use the same list everywhere.
            sentences = [s for s in text_content.split('.') if s.strip()]
            paragraphs = [p.strip() for p in text_content.split('\n\n') if p.strip()]

            return {
                'total_words': len(words),
                'total_sentences': len(sentences),
                'total_lines': len(lines),
                'total_paragraphs': len(paragraphs),
                'average_words_per_sentence': len(words) / len(sentences) if sentences else 0,
                'average_sentences_per_paragraph': len(sentences) / len(paragraphs) if paragraphs else 0,
                'character_count': len(text_content),
                'reading_time_minutes': len(words) / 200,  # ~200 words per minute
                'complexity_score': self._calculate_text_complexity(text_content)
            }

        except Exception as e:
            return {'error': f"Could not analyze text structure: {str(e)}"}

    def _calculate_text_complexity(self, text: str) -> float:
        """Calculate a simple text complexity score (0-10)."""
        words = text.split()
        sentences = [s for s in text.split('.') if s.strip()]

        if not sentences:
            return 0.0

        # Longer sentences and longer words both raise complexity.
        avg_words_per_sentence = len(words) / len(sentences)
        avg_chars_per_word = sum(len(word) for word in words) / len(words) if words else 0

        # Simple weighted score, capped at 10.
        complexity = (avg_words_per_sentence * 0.1) + (avg_chars_per_word * 0.5)
        return min(complexity, 10.0)
215
+
216
+
217
class WebpageParser(BaseParser):
    """Parser for web content.

    Fetches pages over HTTP with retries, strips boilerplate markup with
    BeautifulSoup, and returns structured dictionaries containing the
    main text plus metadata (title, headings, links, images, ...).
    """

    def __init__(self):
        super().__init__()
        self.supported_formats = ['http', 'https']
        # Browser-like User-Agent: some sites reject the default
        # python-requests UA outright.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.timeout = 10      # seconds per HTTP request
        self.max_retries = 3   # attempts per page, with exponential backoff

    def parse_website(self, url: str, max_pages: int = 1, include_subpages: bool = False) -> List[Dict[str, Any]]:
        """
        Parse website content and return structured data.

        Args:
            url (str): Website URL to parse
            max_pages (int): Maximum number of pages to parse
            include_subpages (bool): Whether to include same-domain subpages

        Returns:
            List[Dict]: List of page data with content and metadata
        """
        try:
            pages_data = []
            urls_to_process = [url]
            processed_urls = set()

            # Discover additional same-domain URLs before crawling.
            if include_subpages and max_pages > 1:
                subpage_urls = self._find_subpages(url, max_pages - 1)
                urls_to_process.extend(subpage_urls)

            for current_url in urls_to_process[:max_pages]:
                if current_url in processed_urls:
                    continue

                page_data = self._parse_single_page(current_url)
                if page_data:
                    pages_data.append(page_data)
                    processed_urls.add(current_url)

                # Be polite to the host between requests.
                time.sleep(1)

            return pages_data

        except Exception as e:
            raise Exception(f"Error parsing website: {str(e)}")

    def _parse_single_page(self, url: str) -> Dict[str, Any]:
        """Fetch and parse one page.

        Returns a data dict on success, an ``{'url', 'error'}`` dict on
        parse failure, or None when no response could be obtained.
        """
        try:
            # Fetch with retries and exponential backoff.
            response = None
            for attempt in range(self.max_retries):
                try:
                    response = requests.get(url, headers=self.headers, timeout=self.timeout)
                    response.raise_for_status()
                    break
                except requests.RequestException as e:
                    if attempt == self.max_retries - 1:
                        raise e
                    time.sleep(2 ** attempt)  # Exponential backoff

            if not response:
                return None

            soup = BeautifulSoup(response.content, 'html.parser')

            # Drop chrome/boilerplate elements before extracting text.
            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
                element.decompose()

            main_content = self._extract_main_content(soup)

            title = self._extract_title(soup)
            description = self._extract_description(soup)
            headings = self._extract_headings(soup)
            links = self._extract_links(soup, url)

            cleaned_text = self._clean_text_content(main_content)

            return {
                'url': url,
                'title': title,
                'description': description,
                'content': cleaned_text,
                'headings': headings,
                'internal_links': links['internal'],
                'external_links': links['external'],
                'word_count': len(cleaned_text.split()),
                'char_count': len(cleaned_text),
                'meta_keywords': self._extract_meta_keywords(soup),
                'images': self._extract_images(soup, url),
                'parser': 'WebpageParser',
                'parsed_at': time.strftime('%Y-%m-%d %H:%M:%S')
            }

        except Exception as e:
            return {'url': url, 'error': f"Failed to parse page: {str(e)}"}

    def _extract_main_content(self, soup: BeautifulSoup) -> str:
        """Extract the main content from the page."""
        # Try common main-content containers in order of preference.
        content_selectors = [
            'main',
            'article',
            '[role="main"]',
            '.content',
            '.main-content',
            '#content',
            '#main',
            '.post-content',
            '.entry-content'
        ]

        for selector in content_selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text(separator=' ', strip=True)

        # Fallback to body content, then the whole document.
        body = soup.find('body')
        if body:
            return body.get_text(separator=' ', strip=True)

        return soup.get_text(separator=' ', strip=True)

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract page title, falling back to the first <h1>."""
        title_tag = soup.find('title')
        if title_tag:
            return title_tag.get_text().strip()

        h1 = soup.find('h1')
        if h1:
            return h1.get_text().strip()

        return "No Title Found"

    def _extract_description(self, soup: BeautifulSoup) -> str:
        """Extract meta description, falling back to Open Graph."""
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc and meta_desc.get('content'):
            return meta_desc['content'].strip()

        og_desc = soup.find('meta', attrs={'property': 'og:description'})
        if og_desc and og_desc.get('content'):
            return og_desc['content'].strip()

        return "No Description Found"

    def _extract_headings(self, soup: BeautifulSoup) -> List[Dict[str, Any]]:
        """Extract all h1-h6 headings with their hierarchy level."""
        headings = []

        for level in range(1, 7):
            for heading in soup.find_all(f'h{level}'):
                text = heading.get_text(strip=True)
                if text:
                    headings.append({
                        'level': level,
                        'text': text,
                        'id': heading.get('id', ''),
                        'class': heading.get('class', [])
                    })

        return headings

    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> Dict[str, List[str]]:
        """Extract deduplicated internal and external links."""
        internal_links = []
        external_links = []
        base_domain = urlparse(base_url).netloc

        for link in soup.find_all('a', href=True):
            full_url = urljoin(base_url, link['href'])
            parsed_url = urlparse(full_url)

            if parsed_url.netloc == base_domain:
                internal_links.append(full_url)
            elif parsed_url.netloc:  # External link with a real domain
                external_links.append(full_url)

        return {
            'internal': list(set(internal_links)),
            'external': list(set(external_links))
        }

    def _extract_meta_keywords(self, soup: BeautifulSoup) -> List[str]:
        """Extract meta keywords if available."""
        meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
        if meta_keywords and meta_keywords.get('content'):
            keywords = meta_keywords['content'].split(',')
            return [kw.strip() for kw in keywords if kw.strip()]
        return []

    def _extract_images(self, soup: BeautifulSoup, base_url: str) -> List[Dict[str, str]]:
        """Extract image URLs (absolutized) with alt/title text."""
        images = []

        for img in soup.find_all('img'):
            src = img.get('src')
            if src:
                images.append({
                    'src': urljoin(base_url, src),
                    'alt': img.get('alt', ''),
                    'title': img.get('title', '')
                })

        return images

    def _clean_text_content(self, text: str) -> str:
        """Clean and normalize text content.

        Drops empty and single-character lines, then collapses every
        run of whitespace into a single space.
        """
        if not text:
            return ""

        # Keep only meaningful lines (skip blanks and stray single chars).
        cleaned_lines = []
        for line in text.split('\n'):
            line = line.strip()
            if line and len(line) > 1:
                cleaned_lines.append(line)

        # BUG FIX: the original collapse loop replaced a single space
        # with a single space while a space was present — a no-op that
        # never terminates on any text containing whitespace (the
        # intended double-space pattern was lost). split()/join collapses
        # all whitespace runs in one pass instead.
        return ' '.join(' '.join(cleaned_lines).split())

    def _find_subpages(self, url: str, max_subpages: int) -> List[str]:
        """Find same-domain subpage URLs linked from the given page."""
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            base_domain = urlparse(url).netloc
            subpages = set()

            for link in soup.find_all('a', href=True):
                full_url = urljoin(url, link['href'])
                parsed_url = urlparse(full_url)

                # Same-domain pages only; skip obvious binary assets.
                if (parsed_url.netloc == base_domain and
                    full_url != url and
                    not any(ext in full_url.lower() for ext in ['.pdf', '.jpg', '.png', '.gif', '.zip'])):
                    subpages.add(full_url)

                    if len(subpages) >= max_subpages:
                        break

            return list(subpages)[:max_subpages]

        except Exception:
            # Best-effort discovery: no subpages on any failure.
            return []

    def validate_url(self, url: str) -> bool:
        """Return True when a HEAD request to *url* answers 200."""
        try:
            response = requests.head(url, headers=self.headers, timeout=5)
            return response.status_code == 200
        except Exception:
            # Narrowed from a bare except; any failure means "not accessible".
            return False

    def get_website_info(self, url: str) -> Dict[str, Any]:
        """Get basic information about a website."""
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # BUG FIX: the original read soup.get('lang') on the document
            # object, which has no attributes and therefore always
            # returned 'unknown'; the lang attribute lives on <html>.
            html_tag = soup.find('html')
            language = html_tag.get('lang', 'unknown') if html_tag else 'unknown'

            return {
                'url': url,
                'title': self._extract_title(soup),
                'description': self._extract_description(soup),
                'meta_keywords': self._extract_meta_keywords(soup),
                'has_robots_meta': bool(soup.find('meta', attrs={'name': 'robots'})),
                'has_viewport_meta': bool(soup.find('meta', attrs={'name': 'viewport'})),
                'language': language,
                'status_code': response.status_code,
                'content_type': response.headers.get('content-type', 'unknown'),
                'server': response.headers.get('server', 'unknown')
            }

        except Exception as e:
            return {'url': url, 'error': f"Could not get website info: {str(e)}"}
524
+
525
+
526
class ParserFactory:
    """Factory class to create appropriate parsers."""

    @staticmethod
    def get_parser(source_type: str):
        """Get a new parser instance for *source_type* (or None).

        Accepts 'pdf', 'text', 'webpage', or 'url' (case-insensitive).
        """
        # Map to classes and instantiate lazily: the original built all
        # four parser objects on every call just to return one of them.
        parser_classes = {
            'pdf': PDFParser,
            'text': TextParser,
            'webpage': WebpageParser,
            'url': WebpageParser
        }

        parser_cls = parser_classes.get(source_type.lower())
        return parser_cls() if parser_cls else None

    @staticmethod
    def detect_source_type(source: str) -> str:
        """Detect the type of content source from its shape."""
        if source.startswith(('http://', 'https://')):
            return 'webpage'
        # BUG FIX: case-insensitive so 'REPORT.PDF' routes to the PDF
        # parser, consistent with get_parser's lowercased lookup.
        elif source.lower().endswith('.pdf'):
            return 'pdf'
        else:
            return 'text'
utils/scorer.py ADDED
@@ -0,0 +1,501 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GEO Scoring Module
3
+ Analyzes content for Generative Engine Optimization (GEO) performance
4
+ """
5
+
6
+ import json
7
+ from typing import Dict, Any, List
8
+ from langchain.prompts import ChatPromptTemplate
9
+
10
+
11
+ class GEOScorer:
12
+ """Main class for calculating GEO scores and analysis"""
13
+
14
def __init__(self, llm):
    """Create a scorer bound to an LLM.

    Args:
        llm: Chat model combined into ``prompt | llm`` chains elsewhere
            in this class; presumably any LangChain-compatible model
            supporting ``invoke`` — TODO confirm against callers.
    """
    self.llm = llm
    # Build the reusable prompt strings once up front.
    self.setup_prompts()
17
+
18
def setup_prompts(self):
    """Initialize prompts for different types of analysis.

    Defines three prompt strings used by the scoring methods:
    a full eight-criterion GEO rubric, a faster four-criterion quick
    score, and a two-document competitive comparison.
    """

    # Main GEO analysis prompt: full eight-criterion rubric plus gap
    # and opportunity identification, with an explicit JSON schema the
    # model is asked to follow.
    self.geo_analysis_prompt = """You are a Generative Engine Optimizer (GEO) specialist. Analyze the provided content for its effectiveness in AI-powered search engines and LLM systems.

Evaluate the content based on these GEO criteria (score 1-10 each):

1. **AI Search Visibility**: How likely is this content to be surfaced by AI search engines?
2. **Query Intent Matching**: How well does the content match common user queries?
3. **Factual Accuracy & Authority**: How trustworthy and authoritative is the information?
4. **Conversational Readiness**: How suitable is the content for AI chat responses?
5. **Semantic Richness**: How well does the content use relevant semantic keywords?
6. **Context Completeness**: Does the content provide complete, self-contained answers?
7. **Citation Worthiness**: How likely are AI systems to cite this content?
8. **Multi-Query Coverage**: Does the content answer multiple related questions?

Also identify:
- Primary topics and entities
- Missing information gaps
- Optimization opportunities
- Specific enhancement recommendations

Format your response as JSON:

```json
{
    "geo_scores": {
        "ai_search_visibility": 7.5,
        "query_intent_matching": 8.0,
        "factual_accuracy": 9.0,
        "conversational_readiness": 6.5,
        "semantic_richness": 7.0,
        "context_completeness": 8.5,
        "citation_worthiness": 7.8,
        "multi_query_coverage": 6.0
    },
    "overall_geo_score": 7.5,
    "primary_topics": ["topic1", "topic2"],
    "entities": ["entity1", "entity2"],
    "missing_gaps": ["gap1", "gap2"],
    "optimization_opportunities": [
        {
            "type": "semantic_enhancement",
            "description": "Add more related terms",
            "priority": "high"
        }
    ],
    "recommendations": [
        "Specific actionable recommendation 1",
        "Specific actionable recommendation 2"
    ]
}
```"""

    # Quick scoring prompt for faster analysis: only four criteria and
    # a single top recommendation.
    self.quick_score_prompt = """Analyze this content for AI search optimization. Provide scores (1-10) for:

1. AI Search Visibility
2. Query Intent Matching
3. Conversational Readiness
4. Citation Worthiness

Respond in JSON format:
```json
{
    "scores": {
        "ai_search_visibility": 7.5,
        "query_intent_matching": 8.0,
        "conversational_readiness": 6.5,
        "citation_worthiness": 7.8
    },
    "overall_score": 7.5,
    "top_recommendation": "Most important improvement needed"
}
```"""

    # Competitive analysis prompt comparing two content pieces.
    # NOTE(review): this string mixes .format-style placeholders
    # ({content_a}, {content_b}) with unescaped braces in the JSON
    # example; str.format (or ChatPromptTemplate templating) would choke
    # on the single braces — confirm how this prompt is rendered.
    self.competitive_prompt = """Compare these content pieces for GEO performance. Identify which performs better for AI search and why.

Content A: {content_a}

Content B: {content_b}

Provide analysis in JSON:
```json
{
    "winner": "A" or "B",
    "score_comparison": {
        "content_a_score": 7.5,
        "content_b_score": 8.2
    },
    "key_differences": ["difference1", "difference2"],
    "improvement_suggestions": {
        "content_a": ["suggestion1"],
        "content_b": ["suggestion1"]
    }
}
```"""
118
def analyze_page_geo(self, content: str, title: str, detailed: bool = True) -> Dict[str, Any]:
    """
    Analyze a single page for GEO performance.

    Args:
        content (str): Page content to analyze
        title (str): Page title
        detailed (bool): Whether to run the full eight-metric analysis
            (True) or the quick four-metric pass (False)

    Returns:
        Dict: Parsed GEO analysis plus metadata, or a dict with an
        'error' key on failure.
    """
    try:
        # BUG FIX: ChatPromptTemplate interprets single braces as
        # template variables. The system prompts embed literal JSON
        # braces (and page content may contain braces too), so
        # invoke({}) would raise a missing-variable error. Escape all
        # braces before building the template.
        def esc(text: str) -> str:
            return text.replace('{', '{{').replace('}', '}}')

        # Choose prompt and content budget based on detail level.
        if detailed:
            system_prompt = self.geo_analysis_prompt
            user_message = f"Title: {title}\n\nContent: {content[:8000]}"  # Limit content length
        else:
            system_prompt = self.quick_score_prompt
            user_message = f"Title: {title}\n\nContent: {content[:4000]}"

        prompt_template = ChatPromptTemplate.from_messages([
            ("system", esc(system_prompt)),
            ("user", esc(user_message))
        ])

        # Run analysis
        chain = prompt_template | self.llm
        result = chain.invoke({})

        # Extract the text payload regardless of message/str result type.
        result_content = result.content if hasattr(result, 'content') else str(result)
        parsed_result = self._parse_llm_response(result_content)

        # Attach metadata about what was analyzed.
        parsed_result.update({
            'analyzed_title': title,
            'content_length': len(content),
            'word_count': len(content.split()),
            'analysis_type': 'detailed' if detailed else 'quick'
        })

        return parsed_result

    except Exception as e:
        return {'error': f"GEO analysis failed: {str(e)}"}
163
+
164
def analyze_multiple_pages(self, pages_data: List[Dict[str, Any]], detailed: bool = True) -> List[Dict[str, Any]]:
    """
    Run the single-page GEO analysis over a list of pages.

    Args:
        pages_data (List[Dict]): Page dicts with 'content', 'title',
            'url' and 'word_count' keys (all optional).
        detailed (bool): Whether to perform the detailed analysis.

    Returns:
        List[Dict]: One analysis result per page; a failed page yields
        a dict with an 'error' key instead.
    """
    results: List[Dict[str, Any]] = []

    for index, page in enumerate(pages_data):
        try:
            analysis = self.analyze_page_geo(
                page.get('content', ''),
                page.get('title', f'Page {index+1}'),
                detailed,
            )
            # Tag the result with where it came from.
            analysis.update({
                'page_url': page.get('url', ''),
                'page_index': index,
                'source_word_count': page.get('word_count', 0),
            })
            results.append(analysis)
        except Exception as exc:
            # Record the failure but keep processing remaining pages.
            results.append({
                'page_index': index,
                'page_url': page.get('url', ''),
                'error': f"Analysis failed: {str(exc)}",
            })

    return results
201
+
202
def compare_content_geo(self, content_a: str, content_b: str, titles: tuple = None) -> Dict[str, Any]:
    """
    Compare two pieces of content for GEO performance.

    Args:
        content_a (str): First content to compare
        content_b (str): Second content to compare
        titles (tuple): Optional (title_a, title_b) pair

    Returns:
        Dict: Comparison analysis results, or a dict with an 'error'
        key on failure.
    """
    try:
        title_a, title_b = titles if titles else ("Content A", "Content B")

        # BUG FIX: the original called str.format() on
        # competitive_prompt, which raised ValueError because the
        # prompt's literal JSON braces are parsed as format fields —
        # the method could never succeed. Substitute the two
        # placeholders with .replace(), which leaves other braces alone.
        # (An unused, dead prompt_template was also removed.)
        formatted_prompt = (
            self.competitive_prompt
            .replace('{content_a}', f"Title: {title_a}\nContent: {content_a[:4000]}")
            .replace('{content_b}', f"Title: {title_b}\nContent: {content_b[:4000]}")
        )

        # Escape the remaining literal braces so ChatPromptTemplate does
        # not treat them as template variables when the chain is invoked.
        escaped_prompt = formatted_prompt.replace('{', '{{').replace('}', '}}')

        chain = ChatPromptTemplate.from_messages([
            ("system", escaped_prompt),
            ("user", "Perform the comparison analysis.")
        ]) | self.llm

        result = chain.invoke({})
        result_content = result.content if hasattr(result, 'content') else str(result)

        return self._parse_llm_response(result_content)

    except Exception as e:
        return {'error': f"Comparison analysis failed: {str(e)}"}
240
+
241
def calculate_aggregate_scores(self, individual_results: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Aggregate per-page GEO analyses into site-level scores and insights.

    Args:
        individual_results (List[Dict]): Per-page analysis results.

    Returns:
        Dict: Averaged metric scores, best/worst metrics, de-duplicated
        recommendations/topics/entities, high-priority opportunities and
        a score distribution — or a dict with an 'error' key when no
        result is usable.
    """
    try:
        usable = [
            r for r in individual_results
            if 'geo_scores' in r and not r.get('error')
        ]
        if not usable:
            return {'error': 'No valid results to aggregate'}

        # Average each metric listed by the first result over every
        # result that actually reports that metric.
        avg_scores: Dict[str, float] = {}
        for metric in usable[0]['geo_scores']:
            values = [r['geo_scores'][metric] for r in usable if metric in r['geo_scores']]
            avg_scores[metric] = sum(values) / len(values) if values else 0

        overall_avg = sum(avg_scores.values()) / len(avg_scores) if avg_scores else 0

        # Pool the qualitative outputs of every page.
        recommendations: List[str] = []
        opportunities: List[Dict[str, Any]] = []
        topics: List[str] = []
        entities: List[str] = []
        for r in usable:
            recommendations.extend(r.get('recommendations', []))
            opportunities.extend(r.get('optimization_opportunities', []))
            topics.extend(r.get('primary_topics', []))
            entities.extend(r.get('entities', []))

        # Set-based de-duplication (order not guaranteed, matching the
        # original behavior).
        unique_recommendations = list(set(recommendations))
        unique_topics = list(set(topics))
        unique_entities = list(set(entities))

        # Best and worst performing metrics by average score.
        best_metric, best_value = max(avg_scores.items(), key=lambda kv: kv[1]) if avg_scores else ('none', 0)
        worst_metric, worst_value = min(avg_scores.items(), key=lambda kv: kv[1]) if avg_scores else ('none', 0)

        return {
            'aggregate_scores': avg_scores,
            'overall_score': overall_avg,
            'pages_analyzed': len(usable),
            'best_performing_metric': {
                'metric': best_metric,
                'score': best_value
            },
            'lowest_performing_metric': {
                'metric': worst_metric,
                'score': worst_value
            },
            'consolidated_recommendations': unique_recommendations[:10],
            'all_topics': unique_topics,
            'all_entities': unique_entities,
            'high_priority_opportunities': [
                opp for opp in opportunities
                if opp.get('priority') == 'high'
            ][:5],
            'score_distribution': self._calculate_score_distribution(avg_scores)
        }

    except Exception as e:
        return {'error': f"Aggregation failed: {str(e)}"}
312
+
313
def generate_geo_report(self, analysis_results: Dict[str, Any], website_url: str = None) -> Dict[str, Any]:
    """
    Assemble a comprehensive GEO report from aggregated results.

    Args:
        analysis_results (Dict): Output of calculate_aggregate_scores().
        website_url (str): Optional site URL recorded in the metadata.

    Returns:
        Dict: Structured report, or a dict with an 'error' key on failure.
    """
    try:
        metadata = {
            'generated_at': self._get_timestamp(),
            'website_url': website_url,
            'analysis_type': 'GEO Performance Report'
        }
        technical = {
            'pages_analyzed': analysis_results.get('pages_analyzed', 0),
            'overall_score': analysis_results.get('overall_score', 0),
            'score_distribution': analysis_results.get('score_distribution', {})
        }
        recommendations = analysis_results.get('consolidated_recommendations', [])

        return {
            'report_metadata': metadata,
            'executive_summary': self._generate_executive_summary(analysis_results),
            'detailed_scores': analysis_results.get('aggregate_scores', {}),
            'performance_insights': self._generate_performance_insights(analysis_results),
            'actionable_recommendations': self._prioritize_recommendations(recommendations),
            'optimization_roadmap': self._create_optimization_roadmap(analysis_results),
            'competitive_position': self._assess_competitive_position(analysis_results),
            'technical_details': technical
        }

    except Exception as e:
        return {'error': f"Report generation failed: {str(e)}"}
350
+
351
+ def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
352
+ """Parse LLM response and extract JSON content"""
353
+ try:
354
+ # Find JSON content in the response
355
+ json_start = response_text.find('{')
356
+ json_end = response_text.rfind('}') + 1
357
+
358
+ if json_start != -1 and json_end != -1:
359
+ json_str = response_text[json_start:json_end]
360
+ return json.loads(json_str)
361
+ else:
362
+ # If no JSON found, return the raw response
363
+ return {'raw_response': response_text, 'parsing_error': 'No JSON found'}
364
+
365
+ except json.JSONDecodeError as e:
366
+ return {'raw_response': response_text, 'parsing_error': f'JSON decode error: {str(e)}'}
367
+ except Exception as e:
368
+ return {'raw_response': response_text, 'parsing_error': f'Unexpected error: {str(e)}'}
369
+
370
+ def _calculate_score_distribution(self, scores: Dict[str, float]) -> Dict[str, Any]:
371
+ """Calculate distribution of scores for insights"""
372
+ if not scores:
373
+ return {}
374
+
375
+ score_values = list(scores.values())
376
+
377
+ return {
378
+ 'highest_score': max(score_values),
379
+ 'lowest_score': min(score_values),
380
+ 'average_score': sum(score_values) / len(score_values),
381
+ 'score_range': max(score_values) - min(score_values),
382
+ 'scores_above_7': len([s for s in score_values if s >= 7.0]),
383
+ 'scores_below_5': len([s for s in score_values if s < 5.0])
384
+ }
385
+
386
+ def _generate_executive_summary(self, analysis_results: Dict[str, Any]) -> str:
387
+ """Generate executive summary based on analysis results"""
388
+ overall_score = analysis_results.get('overall_score', 0)
389
+ pages_analyzed = analysis_results.get('pages_analyzed', 0)
390
+
391
+ if overall_score >= 8.0:
392
+ performance = "excellent"
393
+ elif overall_score >= 6.5:
394
+ performance = "good"
395
+ elif overall_score >= 5.0:
396
+ performance = "moderate"
397
+ else:
398
+ performance = "needs improvement"
399
+
400
+ return f"Analysis of {pages_analyzed} pages shows {performance} GEO performance with an overall score of {overall_score:.1f}/10. Key opportunities exist in {analysis_results.get('lowest_performing_metric', {}).get('metric', 'multiple areas')}."
401
+
402
+ def _generate_performance_insights(self, analysis_results: Dict[str, Any]) -> List[str]:
403
+ """Generate performance insights based on analysis"""
404
+ insights = []
405
+
406
+ best_metric = analysis_results.get('best_performing_metric', {})
407
+ worst_metric = analysis_results.get('lowest_performing_metric', {})
408
+
409
+ if best_metric.get('score', 0) >= 8.0:
410
+ insights.append(f"Strong performance in {best_metric.get('metric', 'unknown')} (score: {best_metric.get('score', 0):.1f})")
411
+
412
+ if worst_metric.get('score', 10) < 6.0:
413
+ insights.append(f"Significant improvement needed in {worst_metric.get('metric', 'unknown')} (score: {worst_metric.get('score', 0):.1f})")
414
+
415
+ score_dist = analysis_results.get('score_distribution', {})
416
+ if score_dist.get('score_range', 0) > 3.0:
417
+ insights.append("High variability in scores indicates inconsistent optimization across metrics")
418
+
419
+ return insights
420
+
421
+ def _prioritize_recommendations(self, recommendations: List[str]) -> List[Dict[str, Any]]:
422
+ """Prioritize recommendations based on impact potential"""
423
+ prioritized = []
424
+
425
+ # Simple prioritization based on keywords
426
+ high_impact_keywords = ['semantic', 'structure', 'authority', 'factual']
427
+ medium_impact_keywords = ['readability', 'clarity', 'format']
428
+
429
+ for i, rec in enumerate(recommendations):
430
+ priority = 'low'
431
+ if any(keyword in rec.lower() for keyword in high_impact_keywords):
432
+ priority = 'high'
433
+ elif any(keyword in rec.lower() for keyword in medium_impact_keywords):
434
+ priority = 'medium'
435
+
436
+ prioritized.append({
437
+ 'recommendation': rec,
438
+ 'priority': priority,
439
+ 'order': i + 1
440
+ })
441
+
442
+ # Sort by priority
443
+ priority_order = {'high': 1, 'medium': 2, 'low': 3}
444
+ prioritized.sort(key=lambda x: priority_order[x['priority']])
445
+
446
+ return prioritized
447
+
448
+ def _create_optimization_roadmap(self, analysis_results: Dict[str, Any]) -> Dict[str, List[str]]:
449
+ """Create a phased optimization roadmap"""
450
+ roadmap = {
451
+ 'immediate_actions': [],
452
+ 'short_term_goals': [],
453
+ 'long_term_strategy': []
454
+ }
455
+
456
+ overall_score = analysis_results.get('overall_score', 0)
457
+ worst_metric = analysis_results.get('lowest_performing_metric', {})
458
+
459
+ # Immediate actions based on worst performing metric
460
+ if worst_metric.get('score', 10) < 5.0:
461
+ roadmap['immediate_actions'].append(f"Address critical issues in {worst_metric.get('metric', 'low-scoring areas')}")
462
+
463
+ # Short-term goals
464
+ if overall_score < 7.0:
465
+ roadmap['short_term_goals'].append("Improve overall GEO score to above 7.0")
466
+ roadmap['short_term_goals'].append("Enhance content structure and semantic richness")
467
+
468
+ # Long-term strategy
469
+ roadmap['long_term_strategy'].append("Establish consistent GEO optimization process")
470
+ roadmap['long_term_strategy'].append("Monitor and track AI search performance")
471
+
472
+ return roadmap
473
+
474
+ def _assess_competitive_position(self, analysis_results: Dict[str, Any]) -> Dict[str, Any]:
475
+ """Assess competitive position based on scores"""
476
+ overall_score = analysis_results.get('overall_score', 0)
477
+
478
+ if overall_score >= 8.5:
479
+ position = "market_leader"
480
+ description = "Content is highly optimized for AI search engines"
481
+ elif overall_score >= 7.0:
482
+ position = "competitive"
483
+ description = "Content performs well but has room for improvement"
484
+ elif overall_score >= 5.5:
485
+ position = "average"
486
+ description = "Content meets basic standards but lacks optimization"
487
+ else:
488
+ position = "needs_work"
489
+ description = "Content requires significant optimization for AI search"
490
+
491
+ return {
492
+ 'position': position,
493
+ 'description': description,
494
+ 'score': overall_score,
495
+ 'percentile_estimate': min(overall_score * 10, 100) # Rough percentile estimate
496
+ }
497
+
498
+ def _get_timestamp(self) -> str:
499
+ """Get current timestamp"""
500
+ from datetime import datetime
501
+ return datetime.now().strftime('%Y-%m-%d %H:%M:%S')