MBilal-72 committed on
Commit
b990fc0
·
verified ·
1 Parent(s): f4007a0

Update utils/optimizer.py

Browse files
Files changed (1) hide show
  1. utils/optimizer.py +166 -454
utils/optimizer.py CHANGED
@@ -1,7 +1,5 @@
1
- """
2
- Enhanced Content Optimization Module with RAG for GEO
3
- Integrates RAG functionality for better Generative Engine Optimization
4
- """
5
 
6
  import json
7
  import re
@@ -91,535 +89,249 @@ class ContentOptimizer:
91
 
92
  def setup_prompts(self):
93
  """Initialize optimization prompts with RAG integration"""
94
-
95
  self.rag_enhancement_prompt = """
96
- You are a Generative Engine Optimization (GEO) specialist with access to best practices knowledge.
97
-
98
- Based on the provided GEO knowledge and the user's content, optimize the content for:
99
- 1. AI search engines (ChatGPT, Claude, Gemini)
100
- 2. LLM-based question answering systems
101
- 3. Conversational AI interfaces
102
- 4. Citation and reference systems
103
-
104
- Use the knowledge base to inform your optimization decisions.
105
-
106
- Knowledge Base Context:
107
- {context}
108
-
109
- Original Content:
110
- {content}
111
-
112
- Provide comprehensive GEO optimization in JSON format:
113
- ```json
114
- {{
115
- "geo_analysis": {{
116
- "current_geo_score": 7.5,
117
- "ai_search_visibility": 8.0,
118
- "query_intent_matching": 7.0,
119
- "conversational_readiness": 8.5,
120
- "citation_worthiness": 6.5,
121
- "context_completeness": 7.5
122
- }},
123
- "optimization_opportunities": [
124
  {{
125
- "type": "Structure Enhancement",
126
- "description": "Add clear headings and Q&A format",
127
- "priority": "high",
128
- "expected_impact": "Improve AI parsing by 25%"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  }}
130
- ],
131
- "optimized_content": {{
132
- "enhanced_text": "Your optimized content here...",
133
- "structural_improvements": ["Added FAQ section", "Improved headings"],
134
- "semantic_enhancements": ["Added related terms", "Improved entity density"]
135
- }},
136
- "geo_keywords": {{
137
- "primary_entities": ["entity1", "entity2"],
138
- "semantic_terms": ["term1", "term2"],
139
- "question_patterns": ["What is...", "How does..."],
140
- "related_concepts": ["concept1", "concept2"]
141
- }},
142
- "recommendations": [
143
- "Add more specific examples",
144
- "Include authoritative citations",
145
- "Improve conversational flow"
146
- ]
147
- }}
148
- ```
149
- """
150
 
151
  self.competitive_geo_prompt = """
152
- Analyze the content against GEO best practices and identify competitive optimization opportunities.
153
-
154
- GEO Knowledge Base:
155
- {context}
156
-
157
- Content to Analyze:
158
- {content}
159
-
160
- Provide competitive GEO analysis:
161
- ```json
162
- {{
163
- "competitive_gaps": {{
164
- "missing_question_patterns": ["What questions aren't covered"],
165
- "entity_gaps": ["Important entities not mentioned"],
166
- "semantic_opportunities": ["Related terms to include"],
167
- "structural_weaknesses": ["Formatting issues for AI"]
168
- }},
169
- "benchmark_comparison": {{
170
- "current_performance": {{
171
- "ai_answerability": 6.5,
172
- "semantic_richness": 7.0,
173
- "structural_clarity": 8.0
174
- }},
175
- "optimization_potential": {{
176
- "ai_answerability": 9.0,
177
- "semantic_richness": 8.5,
178
- "structural_clarity": 9.5
179
- }}
180
- }},
181
- "action_plan": [
182
  {{
183
- "priority": "high",
184
- "action": "Add FAQ section",
185
- "rationale": "Improves direct question answering"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  }}
187
- ]
188
- }}
189
- ```
190
- """
191
 
192
- def optimize_content_with_rag(self, content: str, optimization_type: str = "geo_standard",
193
- analyze_only: bool = False) -> Dict[str, Any]:
194
- """
195
- Main RAG-enhanced content optimization for GEO
196
-
197
- Args:
198
- content (str): Content to optimize
199
- optimization_type (str): Type of GEO optimization
200
- analyze_only (bool): Whether to only analyze without rewriting
201
-
202
- Returns:
203
- Dict: Comprehensive GEO optimization results
204
- """
205
  try:
206
- # Create knowledge base documents
207
- knowledge_docs = [Document(page_content=knowledge, metadata={"source": "geo_best_practices"})
208
- for knowledge in self.geo_knowledge]
209
-
210
  if self.vector_chunker:
211
- # Use RAG to get relevant knowledge
212
  qa_chain = self.vector_chunker.create_qa_chain(knowledge_docs, self.llm)
213
-
214
- # Query for relevant GEO practices
215
  geo_query = f"How to optimize this type of content for AI search engines: {content[:500]}"
216
  context_result = qa_chain({"query": geo_query})
217
- context = context_result.get("result", "")
218
- else:
219
- # Fallback to using all knowledge if vector_chunker not available
220
- context = "\n\n".join(self.geo_knowledge)
221
-
222
- # Choose optimization approach
223
- if optimization_type == "competitive_geo":
224
- return self._competitive_geo_optimization(content, context)
225
- else:
226
- return self._standard_geo_optimization(content, context, analyze_only)
227
-
228
  except Exception as e:
229
- return {'error': f"RAG-enhanced optimization failed: {str(e)}"}
230
 
231
  def _standard_geo_optimization(self, content: str, context: str, analyze_only: bool) -> Dict[str, Any]:
232
- """Standard GEO optimization with RAG context"""
233
  try:
234
- prompt_template = ChatPromptTemplate.from_messages([
235
  SystemMessagePromptTemplate.from_template(self.rag_enhancement_prompt),
236
  HumanMessagePromptTemplate.from_template("Optimize this content using GEO best practices.")
237
  ])
238
-
239
- chain = prompt_template | self.llm
240
- result = chain.invoke({
241
- "context": context,
242
- "content": content[:5000] # Limit content length
243
- })
244
-
245
- result_content = result.content if hasattr(result, 'content') else str(result)
246
- parsed_result = self._parse_optimization_result(result_content)
247
-
248
- # Add metadata
249
- parsed_result.update({
250
  'optimization_type': 'geo_standard',
251
  'rag_enhanced': True,
252
  'analyze_only': analyze_only,
253
  'original_length': len(content),
254
  'knowledge_sources': len(self.geo_knowledge)
255
  })
256
-
257
- return parsed_result
258
-
259
  except Exception as e:
260
- return {'error': f"Standard GEO optimization failed: {str(e)}"}
261
 
262
  def _competitive_geo_optimization(self, content: str, context: str) -> Dict[str, Any]:
263
- """Competitive GEO analysis with RAG context"""
264
  try:
265
- prompt_template = ChatPromptTemplate.from_messages([
266
  SystemMessagePromptTemplate.from_template(self.competitive_geo_prompt),
267
  HumanMessagePromptTemplate.from_template("Perform competitive GEO analysis.")
268
  ])
269
-
270
- chain = prompt_template | self.llm
271
- result = chain.invoke({
272
- "context": context,
273
- "content": content[:5000]
274
- })
275
-
276
- result_content = result.content if hasattr(result, 'content') else str(result)
277
- parsed_result = self._parse_optimization_result(result_content)
278
-
279
- parsed_result.update({
280
  'optimization_type': 'competitive_geo',
281
  'rag_enhanced': True,
282
  'competitive_analysis': True
283
  })
284
-
285
- return parsed_result
286
-
287
  except Exception as e:
288
- return {'error': f"Competitive GEO optimization failed: {str(e)}"}
289
 
290
  def batch_optimize_with_rag(self, content_list: List[str], optimization_type: str = "geo_standard") -> List[Dict[str, Any]]:
291
- """
292
- Batch optimize multiple content pieces with RAG
293
-
294
- Args:
295
- content_list: List of content to optimize
296
- optimization_type: Type of optimization
297
-
298
- Returns:
299
- List of optimization results
300
- """
301
  results = []
302
-
303
  for i, content in enumerate(content_list):
304
  try:
305
- result = self.optimize_content_with_rag(
306
- content,
307
- optimization_type=optimization_type
308
- )
309
  result['batch_index'] = i
310
  results.append(result)
311
-
312
  except Exception as e:
313
  results.append({
314
  'batch_index': i,
315
  'error': f"Batch GEO optimization failed: {str(e)}"
316
  })
317
-
318
  return results
319
 
320
  def analyze_geo_readability(self, content: str) -> Dict[str, Any]:
321
- """
322
- Analyze content readability specifically for GEO/AI systems
323
- """
324
  try:
325
- # Basic metrics
326
  words = content.split()
327
- sentences = re.split(r'[.!?]+', content)
328
- sentences = [s.strip() for s in sentences if s.strip()]
329
  paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]
330
-
331
- # GEO-specific analysis
332
- questions = len(re.findall(r'\?', content))
333
- headings = len(re.findall(r'^#+\s', content, re.MULTILINE))
334
- lists = len(re.findall(r'^\s*[-*+]\s', content, re.MULTILINE))
335
- numbers = len(re.findall(r'\b\d+\.?\d*\b', content))
336
-
337
- # Entity-like patterns (proper nouns)
338
- entities = len(re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', content))
339
-
340
- # Calculate GEO readability score
341
  geo_score = self._calculate_geo_readability_score({
342
- 'avg_words_per_sentence': len(words) / len(sentences) if sentences else 0,
343
- 'questions_ratio': questions / len(sentences) if sentences else 0,
344
- 'structure_elements': headings + lists,
345
- 'entity_density': entities / len(words) if words else 0,
346
- 'numeric_data': numbers / len(words) if words else 0
347
  })
348
-
349
  return {
350
- 'geo_readability_metrics': {
351
- 'total_words': len(words),
352
- 'total_sentences': len(sentences),
353
- 'total_paragraphs': len(paragraphs),
354
- 'questions_count': questions,
355
- 'headings_count': headings,
356
- 'lists_count': lists,
357
- 'entity_mentions': entities,
358
- 'numeric_data_points': numbers
359
- },
360
  'geo_readability_score': geo_score,
361
- 'ai_optimization_indicators': {
362
- 'question_ratio': questions / len(sentences) if sentences else 0,
363
- 'structure_score': min(10, (headings + lists) * 2),
364
- 'entity_density': entities / len(words) if words else 0,
365
- 'data_richness': numbers / len(words) if words else 0
366
- },
367
- 'geo_recommendations': self._generate_geo_recommendations({
368
- 'questions': questions,
369
- 'headings': headings,
370
- 'lists': lists,
371
- 'entities': entities,
372
- 'sentences': len(sentences)
373
- })
374
  }
375
-
376
  except Exception as e:
377
  return {'error': f"GEO readability analysis failed: {str(e)}"}
378
 
379
- def extract_geo_entities(self, content: str) -> Dict[str, Any]:
380
- """
381
- Extract entities and concepts relevant for GEO optimization
382
- """
383
- try:
384
- if not self.vector_chunker:
385
- return {'error': 'Vector chunker not available for entity extraction'}
386
-
387
- # Create knowledge context about entity extraction
388
- entity_knowledge = [Document(
389
- page_content="""
390
- For GEO optimization, important entities include:
391
- 1. Named entities: People, organizations, locations, brands
392
- 2. Technical concepts: Industry terms, methodologies, tools
393
- 3. Topical entities: Core subjects, themes, categories
394
- 4. Relational entities: Connected concepts, dependencies
395
- 5. Question entities: What users commonly ask about
396
- """,
397
- metadata={"source": "entity_extraction_guide"}
398
- )]
399
-
400
- qa_chain = self.vector_chunker.create_qa_chain(entity_knowledge, self.llm)
401
-
402
- # Extract different types of entities
403
- extraction_queries = [
404
- "What are the main named entities (people, places, organizations) in this content?",
405
- "What are the key technical concepts and terms?",
406
- "What questions might users have about this content?",
407
- "What related topics and concepts are mentioned?"
408
- ]
409
-
410
- extracted_data = {}
411
- for query in extraction_queries:
412
- full_query = f"{query}\n\nContent: {content[:3000]}"
413
- result = qa_chain({"query": full_query})
414
- query_key = query.split('?')[0].lower().replace(' ', '_').replace('what_are_the_', '')
415
- extracted_data[query_key] = result.get("result", "")
416
-
417
- return {
418
- 'geo_entities': extracted_data,
419
- 'extraction_method': 'rag_enhanced',
420
- 'content_length': len(content),
421
- 'extraction_success': True
422
- }
423
-
424
- except Exception as e:
425
- return {'error': f"GEO entity extraction failed: {str(e)}"}
426
-
427
- def generate_geo_variations(self, content: str, num_variations: int = 3) -> List[Dict[str, Any]]:
428
- """
429
- Generate GEO-optimized content variations using RAG
430
- """
431
- variations = []
432
-
433
- variation_types = [
434
- ("faq_focused", "Transform into FAQ format optimized for AI Q&A systems"),
435
- ("conversational", "Optimize for conversational AI and voice search"),
436
- ("authoritative", "Enhance with authoritative tone for citation systems")
437
- ]
438
-
439
  try:
440
- # Get GEO context
441
- knowledge_docs = [Document(page_content=knowledge, metadata={"source": "geo_practices"})
442
- for knowledge in self.geo_knowledge]
443
-
444
- if self.vector_chunker:
445
- qa_chain = self.vector_chunker.create_qa_chain(knowledge_docs, self.llm)
446
-
447
- for i, (variation_type, description) in enumerate(variation_types[:num_variations]):
448
- try:
449
- # Get specific guidance for this variation type
450
- context_query = f"How to optimize content for {variation_type} in AI systems?"
451
- context_result = qa_chain({"query": context_query})
452
- context = context_result.get("result", "")
453
-
454
- variation_prompt = f"""
455
- Create a {variation_type} version of the content optimized for GEO.
456
-
457
- Context: {context}
458
-
459
- Original Content: {content[:4000]}
460
-
461
- Variation Goal: {description}
462
-
463
- Return JSON:
464
- {{
465
- "variation_type": "{variation_type}",
466
- "optimized_content": "the rewritten content...",
467
- "geo_improvements": ["improvement 1", "improvement 2"],
468
- "target_ai_systems": ["ChatGPT", "Claude", "etc"],
469
- "expected_geo_benefits": ["benefit 1", "benefit 2"]
470
- }}
471
- """
472
-
473
- prompt_template = ChatPromptTemplate.from_messages([
474
- SystemMessagePromptTemplate.from_template(variation_prompt),
475
- HumanMessagePromptTemplate.from_template("Generate the GEO-optimized variation.")
476
- ])
477
-
478
- chain = prompt_template | self.llm
479
- result = chain.invoke({})
480
-
481
- result_content = result.content if hasattr(result, 'content') else str(result)
482
- parsed_result = self._parse_optimization_result(result_content)
483
-
484
- parsed_result.update({
485
- 'variation_index': i,
486
- 'rag_enhanced': True,
487
- 'geo_optimized': True
488
- })
489
-
490
- variations.append(parsed_result)
491
-
492
- except Exception as e:
493
- variations.append({
494
- 'variation_index': i,
495
- 'variation_type': variation_type,
496
- 'error': f"GEO variation generation failed: {str(e)}"
497
- })
498
- else:
499
- return [{'error': 'Vector chunker not available for variation generation'}]
500
-
501
- except Exception as e:
502
- return [{'error': f"GEO variation generation failed: {str(e)}"}]
503
-
504
- return variations
505
-
506
- def _calculate_geo_readability_score(self, metrics: Dict[str, float]) -> float:
507
- """Calculate GEO-specific readability score"""
508
- try:
509
- # GEO-optimized scoring
510
- sentence_score = max(0, 10 - abs(metrics['avg_words_per_sentence'] - 15) * 0.3)
511
- question_score = min(10, metrics['questions_ratio'] * 50) # Reward questions
512
- structure_score = min(10, metrics['structure_elements'] * 1.5) # Reward headings/lists
513
- entity_score = min(10, metrics['entity_density'] * 100) # Reward entities
514
- data_score = min(10, metrics['numeric_data'] * 200) # Reward data points
515
-
516
- # Weighted for GEO priorities
517
- overall_score = (
518
- sentence_score * 0.2 +
519
- question_score * 0.25 +
520
- structure_score * 0.25 +
521
- entity_score * 0.15 +
522
- data_score * 0.15
523
  )
524
-
525
- return round(overall_score, 1)
526
-
527
  except Exception:
528
  return 5.0
529
 
530
- def _generate_geo_recommendations(self, metrics: Dict[str, int]) -> List[str]:
531
- """Generate GEO-specific recommendations"""
532
- recommendations = []
533
-
534
- try:
535
- if metrics['questions'] == 0:
536
- recommendations.append("Add FAQ section or question-based headings for better AI Q&A performance")
537
-
538
- if metrics['headings'] < 2:
539
- recommendations.append("Add more structured headings to improve AI content parsing")
540
-
541
- if metrics['lists'] == 0:
542
- recommendations.append("Include bullet points or numbered lists for better information extraction")
543
-
544
- if metrics['entities'] < 5:
545
- recommendations.append("Include more specific entities (names, places, organizations) for authority")
546
-
547
- if metrics['questions'] / metrics['sentences'] < 0.1:
548
- recommendations.append("Consider transforming statements into question-answer pairs")
549
-
550
- return recommendations
551
-
552
- except Exception:
553
- return ["Unable to generate specific GEO recommendations"]
554
 
555
  def _parse_optimization_result(self, response_text: str) -> Dict[str, Any]:
556
- """Parse LLM response and extract structured results"""
557
  try:
558
- # Find JSON content in the response
559
- json_start = response_text.find('{')
560
- json_end = response_text.rfind('}') + 1
561
-
562
- if json_start != -1 and json_end != -1:
563
- json_str = response_text[json_start:json_end]
564
- parsed = json.loads(json_str)
565
- return parsed
566
- else:
567
- # If no JSON found, return structured error
568
- return {
569
- 'raw_response': response_text,
570
- 'parsing_error': 'No JSON structure found in response',
571
- 'geo_analysis': {
572
- 'current_geo_score': 0,
573
- 'ai_search_visibility': 0,
574
- 'query_intent_matching': 0,
575
- 'conversational_readiness': 0,
576
- 'citation_worthiness': 0,
577
- 'context_completeness': 0
578
- }
579
- }
580
-
581
- except json.JSONDecodeError as e:
582
  return {
583
  'raw_response': response_text,
584
- 'parsing_error': f'JSON decode error: {str(e)}',
585
- 'geo_analysis': {
586
- 'current_geo_score': 0,
587
- 'ai_search_visibility': 0,
588
- 'query_intent_matching': 0,
589
- 'conversational_readiness': 0,
590
- 'citation_worthiness': 0,
591
- 'context_completeness': 0
592
- }
593
  }
594
  except Exception as e:
595
  return {
596
  'raw_response': response_text,
597
- 'parsing_error': f'Unexpected parsing error: {str(e)}',
598
- 'geo_analysis': {
599
- 'current_geo_score': 0,
600
- 'ai_search_visibility': 0,
601
- 'query_intent_matching': 0,
602
- 'conversational_readiness': 0,
603
- 'citation_worthiness': 0,
604
- 'context_completeness': 0
605
- }
606
  }
607
 
608
- # Legacy methods for backward compatibility
609
- def optimize_content(self, content: str, analyze_only: bool = False,
610
- include_keywords: bool = True, optimization_type: str = "standard") -> Dict[str, Any]:
611
- """
612
- Legacy method - redirects to RAG-enhanced optimization
613
- """
614
- if optimization_type == "standard":
615
- return self.optimize_content_with_rag(content, "geo_standard", analyze_only)
616
- elif optimization_type == "seo":
617
- return self.optimize_content_with_rag(content, "geo_standard", analyze_only)
618
- elif optimization_type == "competitive":
619
- return self.optimize_content_with_rag(content, "competitive_geo", analyze_only)
620
- else:
621
- return self.optimize_content_with_rag(content, "geo_standard", analyze_only)
622
 
623
  def analyze_content_readability(self, content: str) -> Dict[str, Any]:
624
- """Legacy method - redirects to GEO readability analysis"""
625
- return self.analyze_geo_readability(content)
 
1
+ # Enhanced Content Optimization Module with RAG for GEO
2
+ # Integrates RAG functionality for better Generative Engine Optimization
 
 
3
 
4
  import json
5
  import re
 
89
 
90
  def setup_prompts(self):
91
  """Initialize optimization prompts with RAG integration"""
 
92
  self.rag_enhancement_prompt = """
93
+ You are a Generative Engine Optimization (GEO) specialist with access to best practices knowledge.
94
+
95
+ Based on the provided GEO knowledge and the user's content, optimize the content for:
96
+ 1. AI search engines (ChatGPT, Claude, Gemini)
97
+ 2. LLM-based question answering systems
98
+ 3. Conversational AI interfaces
99
+ 4. Citation and reference systems
100
+
101
+ Use the knowledge base to inform your optimization decisions.
102
+
103
+ Knowledge Base Context:
104
+ {context}
105
+
106
+ Original Content:
107
+ {content}
108
+
109
+ Provide comprehensive GEO optimization in JSON format:
110
+ ```json
 
 
 
 
 
 
 
 
 
 
111
  {{
112
+ "geo_analysis": {{
113
+ "current_geo_score": 7.5,
114
+ "ai_search_visibility": 8.0,
115
+ "query_intent_matching": 7.0,
116
+ "conversational_readiness": 8.5,
117
+ "citation_worthiness": 6.5,
118
+ "context_completeness": 7.5
119
+ }},
120
+ "optimization_opportunities": [
121
+ {{
122
+ "type": "Structure Enhancement",
123
+ "description": "Add clear headings and Q&A format",
124
+ "priority": "high",
125
+ "expected_impact": "Improve AI parsing by 25%"
126
+ }}
127
+ ],
128
+ "optimized_content": {{
129
+ "enhanced_text": "Your optimized content here...",
130
+ "structural_improvements": ["Added FAQ section", "Improved headings"],
131
+ "semantic_enhancements": ["Added related terms", "Improved entity density"]
132
+ }},
133
+ "geo_keywords": {{
134
+ "primary_entities": ["entity1", "entity2"],
135
+ "semantic_terms": ["term1", "term2"],
136
+ "question_patterns": ["What is...", "How does..."],
137
+ "related_concepts": ["concept1", "concept2"]
138
+ }},
139
+ "recommendations": [
140
+ "Add more specific examples",
141
+ "Include authoritative citations",
142
+ "Improve conversational flow"
143
+ ]
144
  }}
145
+ ```
146
+ """.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
  self.competitive_geo_prompt = """
149
+ Analyze the content against GEO best practices and identify competitive optimization opportunities.
150
+
151
+ GEO Knowledge Base:
152
+ {context}
153
+
154
+ Content to Analyze:
155
+ {content}
156
+
157
+ Provide competitive GEO analysis:
158
+ ```json
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  {{
160
+ "competitive_gaps": {{
161
+ "missing_question_patterns": ["What questions aren't covered"],
162
+ "entity_gaps": ["Important entities not mentioned"],
163
+ "semantic_opportunities": ["Related terms to include"],
164
+ "structural_weaknesses": ["Formatting issues for AI"]
165
+ }},
166
+ "benchmark_comparison": {{
167
+ "current_performance": {{
168
+ "ai_answerability": 6.5,
169
+ "semantic_richness": 7.0,
170
+ "structural_clarity": 8.0
171
+ }},
172
+ "optimization_potential": {{
173
+ "ai_answerability": 9.0,
174
+ "semantic_richness": 8.5,
175
+ "structural_clarity": 9.5
176
+ }}
177
+ }},
178
+ "action_plan": [
179
+ {{
180
+ "priority": "high",
181
+ "action": "Add FAQ section",
182
+ "rationale": "Improves direct question answering"
183
+ }}
184
+ ]
185
  }}
186
+ ```
187
+ """.strip()
 
 
188
 
189
+
190
+
191
+ def optimize_content_with_rag(self, content: str, optimization_type: str = "geo_standard", analyze_only: bool = False) -> Dict[str, Any]:
 
 
 
 
 
 
 
 
 
 
192
  try:
193
+ knowledge_docs = [Document(page_content=k, metadata={"source": "geo_best_practices"}) for k in self.geo_knowledge]
194
+ context = "\n\n".join(self.geo_knowledge)
195
+
 
196
  if self.vector_chunker:
 
197
  qa_chain = self.vector_chunker.create_qa_chain(knowledge_docs, self.llm)
 
 
198
  geo_query = f"How to optimize this type of content for AI search engines: {content[:500]}"
199
  context_result = qa_chain({"query": geo_query})
200
+ context = context_result.get("result", context)
201
+
202
+ return self._competitive_geo_optimization(content, context) if optimization_type == "competitive_geo" else self._standard_geo_optimization(content, context, analyze_only)
203
+
 
 
 
 
 
 
 
204
  except Exception as e:
205
+ return {"error": f"RAG-enhanced optimization failed: {str(e)}"}
206
 
207
  def _standard_geo_optimization(self, content: str, context: str, analyze_only: bool) -> Dict[str, Any]:
 
208
  try:
209
+ prompt = ChatPromptTemplate.from_messages([
210
  SystemMessagePromptTemplate.from_template(self.rag_enhancement_prompt),
211
  HumanMessagePromptTemplate.from_template("Optimize this content using GEO best practices.")
212
  ])
213
+ result = (prompt | self.llm).invoke({"context": context, "content": content[:5000]})
214
+ parsed = self._parse_optimization_result(getattr(result, 'content', str(result)))
215
+ parsed.update({
 
 
 
 
 
 
 
 
 
216
  'optimization_type': 'geo_standard',
217
  'rag_enhanced': True,
218
  'analyze_only': analyze_only,
219
  'original_length': len(content),
220
  'knowledge_sources': len(self.geo_knowledge)
221
  })
222
+ return parsed
 
 
223
  except Exception as e:
224
+ return {"error": f"Standard GEO optimization failed: {str(e)}"}
225
 
226
  def _competitive_geo_optimization(self, content: str, context: str) -> Dict[str, Any]:
 
227
  try:
228
+ prompt = ChatPromptTemplate.from_messages([
229
  SystemMessagePromptTemplate.from_template(self.competitive_geo_prompt),
230
  HumanMessagePromptTemplate.from_template("Perform competitive GEO analysis.")
231
  ])
232
+ result = (prompt | self.llm).invoke({"context": context, "content": content[:5000]})
233
+ parsed = self._parse_optimization_result(getattr(result, 'content', str(result)))
234
+ parsed.update({
 
 
 
 
 
 
 
 
235
  'optimization_type': 'competitive_geo',
236
  'rag_enhanced': True,
237
  'competitive_analysis': True
238
  })
239
+ return parsed
 
 
240
  except Exception as e:
241
+ return {"error": f"Competitive GEO optimization failed: {str(e)}"}
242
 
243
  def batch_optimize_with_rag(self, content_list: List[str], optimization_type: str = "geo_standard") -> List[Dict[str, Any]]:
 
 
 
 
 
 
 
 
 
 
244
  results = []
 
245
  for i, content in enumerate(content_list):
246
  try:
247
+ result = self.optimize_content_with_rag(content, optimization_type)
 
 
 
248
  result['batch_index'] = i
249
  results.append(result)
 
250
  except Exception as e:
251
  results.append({
252
  'batch_index': i,
253
  'error': f"Batch GEO optimization failed: {str(e)}"
254
  })
 
255
  return results
256
 
257
  def analyze_geo_readability(self, content: str) -> Dict[str, Any]:
 
 
 
258
  try:
 
259
  words = content.split()
260
+ sentences = [s.strip() for s in re.split(r'[.!?]+', content) if s.strip()]
 
261
  paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]
262
+
263
+ metrics = {
264
+ 'questions': len(re.findall(r'\?', content)),
265
+ 'headings': len(re.findall(r'^#+\s', content, re.MULTILINE)),
266
+ 'lists': len(re.findall(r'^\s*[-*+]\s', content, re.MULTILINE)),
267
+ 'entities': len(re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', content)),
268
+ 'numbers': len(re.findall(r'\b\d+\.?\d*\b', content)),
269
+ 'sentence_count': len(sentences),
270
+ 'word_count': len(words)
271
+ }
272
+
273
  geo_score = self._calculate_geo_readability_score({
274
+ 'avg_words_per_sentence': metrics['word_count'] / metrics['sentence_count'] if metrics['sentence_count'] else 0,
275
+ 'questions_ratio': metrics['questions'] / metrics['sentence_count'] if metrics['sentence_count'] else 0,
276
+ 'structure_elements': metrics['headings'] + metrics['lists'],
277
+ 'entity_density': metrics['entities'] / metrics['word_count'] if metrics['word_count'] else 0,
278
+ 'numeric_data': metrics['numbers'] / metrics['word_count'] if metrics['word_count'] else 0
279
  })
280
+
281
  return {
282
+ 'geo_readability_metrics': metrics,
 
 
 
 
 
 
 
 
 
283
  'geo_readability_score': geo_score,
284
+ 'geo_recommendations': self._generate_geo_recommendations(metrics)
 
 
 
 
 
 
 
 
 
 
 
 
285
  }
 
286
  except Exception as e:
287
  return {'error': f"GEO readability analysis failed: {str(e)}"}
288
 
289
+ def _calculate_geo_readability_score(self, m: Dict[str, float]) -> float:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  try:
291
+ score = (
292
+ max(0, 10 - abs(m['avg_words_per_sentence'] - 15) * 0.3) * 0.2 +
293
+ min(10, m['questions_ratio'] * 50) * 0.25 +
294
+ min(10, m['structure_elements'] * 1.5) * 0.25 +
295
+ min(10, m['entity_density'] * 100) * 0.15 +
296
+ min(10, m['numeric_data'] * 200) * 0.15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
  )
298
+ return round(score, 1)
 
 
299
  except Exception:
300
  return 5.0
301
 
302
+ def _generate_geo_recommendations(self, m: Dict[str, int]) -> List[str]:
303
+ r = []
304
+ if m['questions'] == 0:
305
+ r.append("Add FAQ section or question-based headings.")
306
+ if m['headings'] < 2:
307
+ r.append("Use more structured headings.")
308
+ if m['lists'] == 0:
309
+ r.append("Include bullet points or numbered lists.")
310
+ if m['entities'] < 5:
311
+ r.append("Add named or topical entities.")
312
+ if m['questions'] / m['sentence_count'] < 0.1:
313
+ r.append("Transform statements into Q&A pairs.")
314
+ return r
 
 
 
 
 
 
 
 
 
 
 
315
 
316
  def _parse_optimization_result(self, response_text: str) -> Dict[str, Any]:
 
317
  try:
318
+ start = response_text.find('{')
319
+ end = response_text.rfind('}') + 1
320
+ if start != -1 and end != -1:
321
+ return json.loads(response_text[start:end])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
  return {
323
  'raw_response': response_text,
324
+ 'parsing_error': 'No JSON structure found.'
 
 
 
 
 
 
 
 
325
  }
326
  except Exception as e:
327
  return {
328
  'raw_response': response_text,
329
+ 'parsing_error': f'JSON parsing error: {str(e)}'
 
 
 
 
 
 
 
 
330
  }
331
 
332
+ # Legacy support methods
333
+ def optimize_content(self, content: str, analyze_only: bool = False, include_keywords: bool = True, optimization_type: str = "standard") -> Dict[str, Any]:
334
+ return self.optimize_content_with_rag(content, optimization_type, analyze_only)
 
 
 
 
 
 
 
 
 
 
 
335
 
336
  def analyze_content_readability(self, content: str) -> Dict[str, Any]:
337
+ return self.analyze_geo_readability(content)