MBilal-72 committed
Commit 369da03 · verified · 1 parent: f5b1b84

corrected analyze_page_geo in utils/scorer.py

Files changed (1):
  1. utils/scorer.py +492 -500
utils/scorer.py CHANGED
@@ -1,501 +1,493 @@
- """
- GEO Scoring Module
- Analyzes content for Generative Engine Optimization (GEO) performance
- """
-
- import json
- from typing import Dict, Any, List
- from langchain.prompts import ChatPromptTemplate
-
-
- class GEOScorer:
-     """Main class for calculating GEO scores and analysis"""
-
-     def __init__(self, llm):
-         self.llm = llm
-         self.setup_prompts()
-
-     def setup_prompts(self):
-         """Initialize prompts for different types of analysis"""
-
-         # Main GEO analysis prompt
-         self.geo_analysis_prompt = """You are a Generative Engine Optimizer (GEO) specialist. Analyze the provided content for its effectiveness in AI-powered search engines and LLM systems.
-
- Evaluate the content based on these GEO criteria (score 1-10 each):
-
- 1. **AI Search Visibility**: How likely is this content to be surfaced by AI search engines?
- 2. **Query Intent Matching**: How well does the content match common user queries?
- 3. **Factual Accuracy & Authority**: How trustworthy and authoritative is the information?
- 4. **Conversational Readiness**: How suitable is the content for AI chat responses?
- 5. **Semantic Richness**: How well does the content use relevant semantic keywords?
- 6. **Context Completeness**: Does the content provide complete, self-contained answers?
- 7. **Citation Worthiness**: How likely are AI systems to cite this content?
- 8. **Multi-Query Coverage**: Does the content answer multiple related questions?
-
- Also identify:
- - Primary topics and entities
- - Missing information gaps
- - Optimization opportunities
- - Specific enhancement recommendations
-
- Format your response as JSON:
-
- ```json
- {
-     "geo_scores": {
-         "ai_search_visibility": 7.5,
-         "query_intent_matching": 8.0,
-         "factual_accuracy": 9.0,
-         "conversational_readiness": 6.5,
-         "semantic_richness": 7.0,
-         "context_completeness": 8.5,
-         "citation_worthiness": 7.8,
-         "multi_query_coverage": 6.0
-     },
-     "overall_geo_score": 7.5,
-     "primary_topics": ["topic1", "topic2"],
-     "entities": ["entity1", "entity2"],
-     "missing_gaps": ["gap1", "gap2"],
-     "optimization_opportunities": [
-         {
-             "type": "semantic_enhancement",
-             "description": "Add more related terms",
-             "priority": "high"
-         }
-     ],
-     "recommendations": [
-         "Specific actionable recommendation 1",
-         "Specific actionable recommendation 2"
-     ]
- }
- ```"""
-
-         # Quick scoring prompt for faster analysis
-         self.quick_score_prompt = """Analyze this content for AI search optimization. Provide scores (1-10) for:
-
- 1. AI Search Visibility
- 2. Query Intent Matching
- 3. Conversational Readiness
- 4. Citation Worthiness
-
- Respond in JSON format:
- ```json
- {
-     "scores": {
-         "ai_search_visibility": 7.5,
-         "query_intent_matching": 8.0,
-         "conversational_readiness": 6.5,
-         "citation_worthiness": 7.8
-     },
-     "overall_score": 7.5,
-     "top_recommendation": "Most important improvement needed"
- }
- ```"""
-
-         # Competitive analysis prompt
-         self.competitive_prompt = """Compare these content pieces for GEO performance. Identify which performs better for AI search and why.
-
- Content A: {content_a}
-
- Content B: {content_b}
-
- Provide analysis in JSON:
- ```json
- {
-     "winner": "A" or "B",
-     "score_comparison": {
-         "content_a_score": 7.5,
-         "content_b_score": 8.2
-     },
-     "key_differences": ["difference1", "difference2"],
-     "improvement_suggestions": {
-         "content_a": ["suggestion1"],
-         "content_b": ["suggestion1"]
-     }
- }
- ```"""
-
-     def analyze_page_geo(self, content: str, title: str, detailed: bool = True) -> Dict[str, Any]:
-         """
-         Analyze a single page for GEO performance
-
-         Args:
-             content (str): Page content to analyze
-             title (str): Page title
-             detailed (bool): Whether to perform detailed analysis
-
-         Returns:
-             Dict: GEO analysis results
-         """
-         try:
-             # Choose prompt based on detail level
-             if detailed:
-                 prompt_template = ChatPromptTemplate.from_messages([
-                     ("system", self.geo_analysis_prompt),
-                     ("user", f"Title: {title}\n\nContent: {content[:8000]}")  # Limit content length
-                 ])
-             else:
-                 prompt_template = ChatPromptTemplate.from_messages([
-                     ("system", self.quick_score_prompt),
-                     ("user", f"Title: {title}\n\nContent: {content[:4000]}")
-                 ])
-
-             # Run analysis
-             chain = prompt_template | self.llm
-             result = chain.invoke({})
-
-             # Extract and parse result
-             result_content = result.content if hasattr(result, 'content') else str(result)
-             parsed_result = self._parse_llm_response(result_content)
-
-             # Add metadata
-             parsed_result.update({
-                 'analyzed_title': title,
-                 'content_length': len(content),
-                 'word_count': len(content.split()),
-                 'analysis_type': 'detailed' if detailed else 'quick'
-             })
-
-             return parsed_result
-
-         except Exception as e:
-             return {'error': f"GEO analysis failed: {str(e)}"}
-
-     def analyze_multiple_pages(self, pages_data: List[Dict[str, Any]], detailed: bool = True) -> List[Dict[str, Any]]:
-         """
-         Analyze multiple pages and return consolidated results
-
-         Args:
-             pages_data (List[Dict]): List of page data with content and metadata
-             detailed (bool): Whether to perform detailed analysis
-
-         Returns:
-             List[Dict]: List of GEO analysis results
-         """
-         results = []
-
-         for i, page_data in enumerate(pages_data):
-             try:
-                 content = page_data.get('content', '')
-                 title = page_data.get('title', f'Page {i+1}')
-
-                 analysis = self.analyze_page_geo(content, title, detailed)
-
-                 # Add page-specific metadata
-                 analysis.update({
-                     'page_url': page_data.get('url', ''),
-                     'page_index': i,
-                     'source_word_count': page_data.get('word_count', 0)
-                 })
-
-                 results.append(analysis)
-
-             except Exception as e:
-                 results.append({
-                     'page_index': i,
-                     'page_url': page_data.get('url', ''),
-                     'error': f"Analysis failed: {str(e)}"
-                 })
-
-         return results
-
-     def compare_content_geo(self, content_a: str, content_b: str, titles: tuple = None) -> Dict[str, Any]:
-         """
-         Compare two pieces of content for GEO performance
-
-         Args:
-             content_a (str): First content to compare
-             content_b (str): Second content to compare
-             titles (tuple): Optional titles for the content pieces
-
-         Returns:
-             Dict: Comparison analysis results
-         """
-         try:
-             title_a, title_b = titles if titles else ("Content A", "Content B")
-
-             prompt_template = ChatPromptTemplate.from_messages([
-                 ("system", self.competitive_prompt),
-                 ("user", "")
-             ])
-
-             # Format the competitive analysis prompt
-             formatted_prompt = self.competitive_prompt.format(
-                 content_a=f"Title: {title_a}\nContent: {content_a[:4000]}",
-                 content_b=f"Title: {title_b}\nContent: {content_b[:4000]}"
-             )
-
-             chain = ChatPromptTemplate.from_messages([
-                 ("system", formatted_prompt),
-                 ("user", "Perform the comparison analysis.")
-             ]) | self.llm
-
-             result = chain.invoke({})
-             result_content = result.content if hasattr(result, 'content') else str(result)
-
-             return self._parse_llm_response(result_content)
-
-         except Exception as e:
-             return {'error': f"Comparison analysis failed: {str(e)}"}
-
-     def calculate_aggregate_scores(self, individual_results: List[Dict[str, Any]]) -> Dict[str, Any]:
-         """
-         Calculate aggregate GEO scores from multiple page analyses
-
-         Args:
-             individual_results (List[Dict]): List of individual page analysis results
-
-         Returns:
-             Dict: Aggregate scores and insights
-         """
-         try:
-             valid_results = [r for r in individual_results if 'geo_scores' in r and not r.get('error')]
-
-             if not valid_results:
-                 return {'error': 'No valid results to aggregate'}
-
-             # Calculate average scores
-             score_keys = list(valid_results[0]['geo_scores'].keys())
-             avg_scores = {}
-
-             for key in score_keys:
-                 scores = [r['geo_scores'][key] for r in valid_results if key in r['geo_scores']]
-                 avg_scores[key] = sum(scores) / len(scores) if scores else 0
-
-             overall_avg = sum(avg_scores.values()) / len(avg_scores) if avg_scores else 0
-
-             # Collect all recommendations and opportunities
-             all_recommendations = []
-             all_opportunities = []
-             all_topics = []
-             all_entities = []
-
-             for result in valid_results:
-                 all_recommendations.extend(result.get('recommendations', []))
-                 all_opportunities.extend(result.get('optimization_opportunities', []))
-                 all_topics.extend(result.get('primary_topics', []))
-                 all_entities.extend(result.get('entities', []))
-
-             # Remove duplicates and prioritize
-             unique_recommendations = list(set(all_recommendations))
-             unique_topics = list(set(all_topics))
-             unique_entities = list(set(all_entities))
-
-             # Find highest and lowest performing areas
-             best_score = max(avg_scores.items(), key=lambda x: x[1]) if avg_scores else ('none', 0)
-             worst_score = min(avg_scores.items(), key=lambda x: x[1]) if avg_scores else ('none', 0)
-
-             return {
-                 'aggregate_scores': avg_scores,
-                 'overall_score': overall_avg,
-                 'pages_analyzed': len(valid_results),
-                 'best_performing_metric': {
-                     'metric': best_score[0],
-                     'score': best_score[1]
-                 },
-                 'lowest_performing_metric': {
-                     'metric': worst_score[0],
-                     'score': worst_score[1]
-                 },
-                 'consolidated_recommendations': unique_recommendations[:10],
-                 'all_topics': unique_topics,
-                 'all_entities': unique_entities,
-                 'high_priority_opportunities': [
-                     opp for opp in all_opportunities
-                     if opp.get('priority') == 'high'
-                 ][:5],
-                 'score_distribution': self._calculate_score_distribution(avg_scores)
-             }
-
-         except Exception as e:
-             return {'error': f"Aggregation failed: {str(e)}"}
-
-     def generate_geo_report(self, analysis_results: Dict[str, Any], website_url: str = None) -> Dict[str, Any]:
-         """
-         Generate a comprehensive GEO report
-
-         Args:
-             analysis_results (Dict): Results from aggregate analysis
-             website_url (str): Optional website URL for context
-
-         Returns:
-             Dict: Comprehensive GEO report
-         """
-         try:
-             report = {
-                 'report_metadata': {
-                     'generated_at': self._get_timestamp(),
-                     'website_url': website_url,
-                     'analysis_type': 'GEO Performance Report'
-                 },
-                 'executive_summary': self._generate_executive_summary(analysis_results),
-                 'detailed_scores': analysis_results.get('aggregate_scores', {}),
-                 'performance_insights': self._generate_performance_insights(analysis_results),
-                 'actionable_recommendations': self._prioritize_recommendations(
-                     analysis_results.get('consolidated_recommendations', [])
-                 ),
-                 'optimization_roadmap': self._create_optimization_roadmap(analysis_results),
-                 'competitive_position': self._assess_competitive_position(analysis_results),
-                 'technical_details': {
-                     'pages_analyzed': analysis_results.get('pages_analyzed', 0),
-                     'overall_score': analysis_results.get('overall_score', 0),
-                     'score_distribution': analysis_results.get('score_distribution', {})
-                 }
-             }
-
-             return report
-
-         except Exception as e:
-             return {'error': f"Report generation failed: {str(e)}"}
-
-     def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
-         """Parse LLM response and extract JSON content"""
-         try:
-             # Find JSON content in the response
-             json_start = response_text.find('{')
-             json_end = response_text.rfind('}') + 1
-
-             if json_start != -1 and json_end != -1:
-                 json_str = response_text[json_start:json_end]
-                 return json.loads(json_str)
-             else:
-                 # If no JSON found, return the raw response
-                 return {'raw_response': response_text, 'parsing_error': 'No JSON found'}
-
-         except json.JSONDecodeError as e:
-             return {'raw_response': response_text, 'parsing_error': f'JSON decode error: {str(e)}'}
-         except Exception as e:
-             return {'raw_response': response_text, 'parsing_error': f'Unexpected error: {str(e)}'}
-
-     def _calculate_score_distribution(self, scores: Dict[str, float]) -> Dict[str, Any]:
-         """Calculate distribution of scores for insights"""
-         if not scores:
-             return {}
-
-         score_values = list(scores.values())
-
-         return {
-             'highest_score': max(score_values),
-             'lowest_score': min(score_values),
-             'average_score': sum(score_values) / len(score_values),
-             'score_range': max(score_values) - min(score_values),
-             'scores_above_7': len([s for s in score_values if s >= 7.0]),
-             'scores_below_5': len([s for s in score_values if s < 5.0])
-         }
-
-     def _generate_executive_summary(self, analysis_results: Dict[str, Any]) -> str:
-         """Generate executive summary based on analysis results"""
-         overall_score = analysis_results.get('overall_score', 0)
-         pages_analyzed = analysis_results.get('pages_analyzed', 0)
-
-         if overall_score >= 8.0:
-             performance = "excellent"
-         elif overall_score >= 6.5:
-             performance = "good"
-         elif overall_score >= 5.0:
-             performance = "moderate"
-         else:
-             performance = "needs improvement"
-
-         return f"Analysis of {pages_analyzed} pages shows {performance} GEO performance with an overall score of {overall_score:.1f}/10. Key opportunities exist in {analysis_results.get('lowest_performing_metric', {}).get('metric', 'multiple areas')}."
-
-     def _generate_performance_insights(self, analysis_results: Dict[str, Any]) -> List[str]:
-         """Generate performance insights based on analysis"""
-         insights = []
-
-         best_metric = analysis_results.get('best_performing_metric', {})
-         worst_metric = analysis_results.get('lowest_performing_metric', {})
-
-         if best_metric.get('score', 0) >= 8.0:
-             insights.append(f"Strong performance in {best_metric.get('metric', 'unknown')} (score: {best_metric.get('score', 0):.1f})")
-
-         if worst_metric.get('score', 10) < 6.0:
-             insights.append(f"Significant improvement needed in {worst_metric.get('metric', 'unknown')} (score: {worst_metric.get('score', 0):.1f})")
-
-         score_dist = analysis_results.get('score_distribution', {})
-         if score_dist.get('score_range', 0) > 3.0:
-             insights.append("High variability in scores indicates inconsistent optimization across metrics")
-
-         return insights
-
-     def _prioritize_recommendations(self, recommendations: List[str]) -> List[Dict[str, Any]]:
-         """Prioritize recommendations based on impact potential"""
-         prioritized = []
-
-         # Simple prioritization based on keywords
-         high_impact_keywords = ['semantic', 'structure', 'authority', 'factual']
-         medium_impact_keywords = ['readability', 'clarity', 'format']
-
-         for i, rec in enumerate(recommendations):
-             priority = 'low'
-             if any(keyword in rec.lower() for keyword in high_impact_keywords):
-                 priority = 'high'
-             elif any(keyword in rec.lower() for keyword in medium_impact_keywords):
-                 priority = 'medium'
-
-             prioritized.append({
-                 'recommendation': rec,
-                 'priority': priority,
-                 'order': i + 1
-             })
-
-         # Sort by priority
-         priority_order = {'high': 1, 'medium': 2, 'low': 3}
-         prioritized.sort(key=lambda x: priority_order[x['priority']])
-
-         return prioritized
-
-     def _create_optimization_roadmap(self, analysis_results: Dict[str, Any]) -> Dict[str, List[str]]:
-         """Create a phased optimization roadmap"""
-         roadmap = {
-             'immediate_actions': [],
-             'short_term_goals': [],
-             'long_term_strategy': []
-         }
-
-         overall_score = analysis_results.get('overall_score', 0)
-         worst_metric = analysis_results.get('lowest_performing_metric', {})
-
-         # Immediate actions based on worst performing metric
-         if worst_metric.get('score', 10) < 5.0:
-             roadmap['immediate_actions'].append(f"Address critical issues in {worst_metric.get('metric', 'low-scoring areas')}")
-
-         # Short-term goals
-         if overall_score < 7.0:
-             roadmap['short_term_goals'].append("Improve overall GEO score to above 7.0")
-             roadmap['short_term_goals'].append("Enhance content structure and semantic richness")
-
-         # Long-term strategy
-         roadmap['long_term_strategy'].append("Establish consistent GEO optimization process")
-         roadmap['long_term_strategy'].append("Monitor and track AI search performance")
-
-         return roadmap
-
-     def _assess_competitive_position(self, analysis_results: Dict[str, Any]) -> Dict[str, Any]:
-         """Assess competitive position based on scores"""
-         overall_score = analysis_results.get('overall_score', 0)
-
-         if overall_score >= 8.5:
-             position = "market_leader"
-             description = "Content is highly optimized for AI search engines"
-         elif overall_score >= 7.0:
-             position = "competitive"
-             description = "Content performs well but has room for improvement"
-         elif overall_score >= 5.5:
-             position = "average"
-             description = "Content meets basic standards but lacks optimization"
-         else:
-             position = "needs_work"
-             description = "Content requires significant optimization for AI search"
-
-         return {
-             'position': position,
-             'description': description,
-             'score': overall_score,
-             'percentile_estimate': min(overall_score * 10, 100)  # Rough percentile estimate
-         }
-
-     def _get_timestamp(self) -> str:
-         """Get current timestamp"""
-         from datetime import datetime
 
+ """
+ GEO Scoring Module
+ Analyzes content for Generative Engine Optimization (GEO) performance
+ """
+
+ import json
+ from typing import Dict, Any, List
+ from langchain.prompts import ChatPromptTemplate
+
+
+ class GEOScorer:
+     """Main class for calculating GEO scores and analysis"""
+
+     def __init__(self, llm):
+         self.llm = llm
+         self.setup_prompts()
+
+     def setup_prompts(self):
+         """Initialize prompts for different types of analysis"""
+
+         # Main GEO analysis prompt
+         self.geo_analysis_prompt = """You are a Generative Engine Optimizer (GEO) specialist. Analyze the provided content for its effectiveness in AI-powered search engines and LLM systems.
+
+ Evaluate the content based on these GEO criteria (score 1-10 each):
+
+ 1. **AI Search Visibility**: How likely is this content to be surfaced by AI search engines?
+ 2. **Query Intent Matching**: How well does the content match common user queries?
+ 3. **Factual Accuracy & Authority**: How trustworthy and authoritative is the information?
+ 4. **Conversational Readiness**: How suitable is the content for AI chat responses?
+ 5. **Semantic Richness**: How well does the content use relevant semantic keywords?
+ 6. **Context Completeness**: Does the content provide complete, self-contained answers?
+ 7. **Citation Worthiness**: How likely are AI systems to cite this content?
+ 8. **Multi-Query Coverage**: Does the content answer multiple related questions?
+
+ Also identify:
+ - Primary topics and entities
+ - Missing information gaps
+ - Optimization opportunities
+ - Specific enhancement recommendations
+
+ Format your response as JSON:
+
+ ```json
+ {
+     "geo_scores": {
+         "ai_search_visibility": 7.5,
+         "query_intent_matching": 8.0,
+         "factual_accuracy": 9.0,
+         "conversational_readiness": 6.5,
+         "semantic_richness": 7.0,
+         "context_completeness": 8.5,
+         "citation_worthiness": 7.8,
+         "multi_query_coverage": 6.0
+     },
+     "overall_geo_score": 7.5,
+     "primary_topics": ["topic1", "topic2"],
+     "entities": ["entity1", "entity2"],
+     "missing_gaps": ["gap1", "gap2"],
+     "optimization_opportunities": [
+         {
+             "type": "semantic_enhancement",
+             "description": "Add more related terms",
+             "priority": "high"
+         }
+     ],
+     "recommendations": [
+         "Specific actionable recommendation 1",
+         "Specific actionable recommendation 2"
+     ]
+ }
+ ```"""
+
+         # Quick scoring prompt for faster analysis
+         self.quick_score_prompt = """Analyze this content for AI search optimization. Provide scores (1-10) for:
+
+ 1. AI Search Visibility
+ 2. Query Intent Matching
+ 3. Conversational Readiness
+ 4. Citation Worthiness
+
+ Respond in JSON format:
+ ```json
+ {
+     "scores": {
+         "ai_search_visibility": 7.5,
+         "query_intent_matching": 8.0,
+         "conversational_readiness": 6.5,
+         "citation_worthiness": 7.8
+     },
+     "overall_score": 7.5,
+     "top_recommendation": "Most important improvement needed"
+ }
+ ```"""
+
+         # Competitive analysis prompt
+         self.competitive_prompt = """Compare these content pieces for GEO performance. Identify which performs better for AI search and why.
+
+ Content A: {content_a}
+
+ Content B: {content_b}
+
+ Provide analysis in JSON:
+ ```json
+ {
+     "winner": "A" or "B",
+     "score_comparison": {
+         "content_a_score": 7.5,
+         "content_b_score": 8.2
+     },
+     "key_differences": ["difference1", "difference2"],
+     "improvement_suggestions": {
+         "content_a": ["suggestion1"],
+         "content_b": ["suggestion1"]
+     }
+ }
+ ```"""
+
+     def analyze_page_geo(self, content: str, title: str, detailed: bool = True) -> Dict[str, Any]:
+         """
+         Analyze a single page for GEO performance
+         """
+         try:
+             # Choose prompt based on detail level
+             if detailed:
+                 system_prompt = self.geo_analysis_prompt
+                 user_message = f"Title: {title}\n\nContent: {content[:8000]}"
+             else:
+                 system_prompt = self.quick_score_prompt
+                 user_message = f"Title: {title}\n\nContent: {content[:4000]}"
+
+             # Build prompt and run analysis
+             prompt_template = ChatPromptTemplate.from_messages([
+                 ("system", system_prompt),
+                 ("user", user_message)
+             ])
+             chain = prompt_template | self.llm
+             result = chain.invoke({})  # No variables needed
+
+             # Extract and parse result
+             result_content = result.content if hasattr(result, 'content') else str(result)
+             parsed_result = self._parse_llm_response(result_content)
+
+             # Add metadata
+             parsed_result.update({
+                 'analyzed_title': title,
+                 'content_length': len(content),
+                 'word_count': len(content.split()),
+                 'analysis_type': 'detailed' if detailed else 'quick'
+             })
+
+             return parsed_result
+
+         except Exception as e:
+             return {'error': f"GEO analysis failed: {str(e)}"}
+
+     def analyze_multiple_pages(self, pages_data: List[Dict[str, Any]], detailed: bool = True) -> List[Dict[str, Any]]:
+         """
+         Analyze multiple pages and return consolidated results
+
+         Args:
+             pages_data (List[Dict]): List of page data with content and metadata
+             detailed (bool): Whether to perform detailed analysis
+
+         Returns:
+             List[Dict]: List of GEO analysis results
+         """
+         results = []
+
+         for i, page_data in enumerate(pages_data):
+             try:
+                 content = page_data.get('content', '')
+                 title = page_data.get('title', f'Page {i+1}')
+
+                 analysis = self.analyze_page_geo(content, title, detailed)
+
+                 # Add page-specific metadata
+                 analysis.update({
+                     'page_url': page_data.get('url', ''),
+                     'page_index': i,
+                     'source_word_count': page_data.get('word_count', 0)
+                 })
+
+                 results.append(analysis)
+
+             except Exception as e:
+                 results.append({
+                     'page_index': i,
+                     'page_url': page_data.get('url', ''),
+                     'error': f"Analysis failed: {str(e)}"
+                 })
+
+         return results
+
+     def compare_content_geo(self, content_a: str, content_b: str, titles: tuple = None) -> Dict[str, Any]:
+         """
+         Compare two pieces of content for GEO performance
+
+         Args:
+             content_a (str): First content to compare
+             content_b (str): Second content to compare
+             titles (tuple): Optional titles for the content pieces
+
+         Returns:
+             Dict: Comparison analysis results
+         """
+         try:
+             title_a, title_b = titles if titles else ("Content A", "Content B")
+
+             prompt_template = ChatPromptTemplate.from_messages([
+                 ("system", self.competitive_prompt),
+                 ("user", "")
+             ])
+
+             # Format the competitive analysis prompt
+             formatted_prompt = self.competitive_prompt.format(
+                 content_a=f"Title: {title_a}\nContent: {content_a[:4000]}",
+                 content_b=f"Title: {title_b}\nContent: {content_b[:4000]}"
+             )
+
+             chain = ChatPromptTemplate.from_messages([
+                 ("system", formatted_prompt),
+                 ("user", "Perform the comparison analysis.")
+             ]) | self.llm
+
+             result = chain.invoke({})
+             result_content = result.content if hasattr(result, 'content') else str(result)
+
+             return self._parse_llm_response(result_content)
+
+         except Exception as e:
+             return {'error': f"Comparison analysis failed: {str(e)}"}
+
+     def calculate_aggregate_scores(self, individual_results: List[Dict[str, Any]]) -> Dict[str, Any]:
+         """
+         Calculate aggregate GEO scores from multiple page analyses
+
+         Args:
+             individual_results (List[Dict]): List of individual page analysis results
+
+         Returns:
+             Dict: Aggregate scores and insights
+         """
+         try:
+             valid_results = [r for r in individual_results if 'geo_scores' in r and not r.get('error')]
+
+             if not valid_results:
+                 return {'error': 'No valid results to aggregate'}
+
+             # Calculate average scores
+             score_keys = list(valid_results[0]['geo_scores'].keys())
+             avg_scores = {}
+
+             for key in score_keys:
+                 scores = [r['geo_scores'][key] for r in valid_results if key in r['geo_scores']]
+                 avg_scores[key] = sum(scores) / len(scores) if scores else 0
+
+             overall_avg = sum(avg_scores.values()) / len(avg_scores) if avg_scores else 0
+
+             # Collect all recommendations and opportunities
+             all_recommendations = []
+             all_opportunities = []
+             all_topics = []
+             all_entities = []
+
+             for result in valid_results:
+                 all_recommendations.extend(result.get('recommendations', []))
+                 all_opportunities.extend(result.get('optimization_opportunities', []))
+                 all_topics.extend(result.get('primary_topics', []))
+                 all_entities.extend(result.get('entities', []))
+
+             # Remove duplicates and prioritize
+             unique_recommendations = list(set(all_recommendations))
+             unique_topics = list(set(all_topics))
+             unique_entities = list(set(all_entities))
+
+             # Find highest and lowest performing areas
+             best_score = max(avg_scores.items(), key=lambda x: x[1]) if avg_scores else ('none', 0)
+             worst_score = min(avg_scores.items(), key=lambda x: x[1]) if avg_scores else ('none', 0)
+
+             return {
+                 'aggregate_scores': avg_scores,
+                 'overall_score': overall_avg,
+                 'pages_analyzed': len(valid_results),
+                 'best_performing_metric': {
+                     'metric': best_score[0],
+                     'score': best_score[1]
+                 },
+                 'lowest_performing_metric': {
+                     'metric': worst_score[0],
+                     'score': worst_score[1]
+                 },
+                 'consolidated_recommendations': unique_recommendations[:10],
+                 'all_topics': unique_topics,
+                 'all_entities': unique_entities,
+                 'high_priority_opportunities': [
+                     opp for opp in all_opportunities
+                     if opp.get('priority') == 'high'
+                 ][:5],
+                 'score_distribution': self._calculate_score_distribution(avg_scores)
+             }
+
+         except Exception as e:
+             return {'error': f"Aggregation failed: {str(e)}"}
+
+     def generate_geo_report(self, analysis_results: Dict[str, Any], website_url: str = None) -> Dict[str, Any]:
+         """
+         Generate a comprehensive GEO report
+
+         Args:
+             analysis_results (Dict): Results from aggregate analysis
+             website_url (str): Optional website URL for context
+
+         Returns:
+             Dict: Comprehensive GEO report
+         """
+         try:
+             report = {
+                 'report_metadata': {
+                     'generated_at': self._get_timestamp(),
+                     'website_url': website_url,
+                     'analysis_type': 'GEO Performance Report'
+                 },
+                 'executive_summary': self._generate_executive_summary(analysis_results),
+                 'detailed_scores': analysis_results.get('aggregate_scores', {}),
+                 'performance_insights': self._generate_performance_insights(analysis_results),
+                 'actionable_recommendations': self._prioritize_recommendations(
+                     analysis_results.get('consolidated_recommendations', [])
+                 ),
+                 'optimization_roadmap': self._create_optimization_roadmap(analysis_results),
+                 'competitive_position': self._assess_competitive_position(analysis_results),
+                 'technical_details': {
+                     'pages_analyzed': analysis_results.get('pages_analyzed', 0),
+                     'overall_score': analysis_results.get('overall_score', 0),
+                     'score_distribution': analysis_results.get('score_distribution', {})
+                 }
+             }
+
+             return report
+
+         except Exception as e:
+             return {'error': f"Report generation failed: {str(e)}"}
+
+     def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
+         """Parse LLM response and extract JSON content"""
+         try:
+             # Find JSON content in the response
+             json_start = response_text.find('{')
+             json_end = response_text.rfind('}') + 1
+
+             if json_start != -1 and json_end != -1:
+                 json_str = response_text[json_start:json_end]
+                 return json.loads(json_str)
+             else:
+                 # If no JSON found, return the raw response
+                 return {'raw_response': response_text, 'parsing_error': 'No JSON found'}
+
+         except json.JSONDecodeError as e:
+             return {'raw_response': response_text, 'parsing_error': f'JSON decode error: {str(e)}'}
+         except Exception as e:
+             return {'raw_response': response_text, 'parsing_error': f'Unexpected error: {str(e)}'}
+
+     def _calculate_score_distribution(self, scores: Dict[str, float]) -> Dict[str, Any]:
+         """Calculate distribution of scores for insights"""
+         if not scores:
+             return {}
+
+         score_values = list(scores.values())
+
+         return {
+             'highest_score': max(score_values),
+             'lowest_score': min(score_values),
+             'average_score': sum(score_values) / len(score_values),
+             'score_range': max(score_values) - min(score_values),
+             'scores_above_7': len([s for s in score_values if s >= 7.0]),
+             'scores_below_5': len([s for s in score_values if s < 5.0])
+         }
+
+     def _generate_executive_summary(self, analysis_results: Dict[str, Any]) -> str:
+         """Generate executive summary based on analysis results"""
+         overall_score = analysis_results.get('overall_score', 0)
+         pages_analyzed = analysis_results.get('pages_analyzed', 0)
+
+         if overall_score >= 8.0:
+             performance = "excellent"
+         elif overall_score >= 6.5:
+             performance = "good"
+         elif overall_score >= 5.0:
+             performance = "moderate"
+         else:
+             performance = "needs improvement"
+
+         return f"Analysis of {pages_analyzed} pages shows {performance} GEO performance with an overall score of {overall_score:.1f}/10. Key opportunities exist in {analysis_results.get('lowest_performing_metric', {}).get('metric', 'multiple areas')}."
+
+     def _generate_performance_insights(self, analysis_results: Dict[str, Any]) -> List[str]:
+         """Generate performance insights based on analysis"""
+         insights = []
+
+         best_metric = analysis_results.get('best_performing_metric', {})
+         worst_metric = analysis_results.get('lowest_performing_metric', {})
+
+         if best_metric.get('score', 0) >= 8.0:
+             insights.append(f"Strong performance in {best_metric.get('metric', 'unknown')} (score: {best_metric.get('score', 0):.1f})")
+
+         if worst_metric.get('score', 10) < 6.0:
+             insights.append(f"Significant improvement needed in {worst_metric.get('metric', 'unknown')} (score: {worst_metric.get('score', 0):.1f})")
+
+         score_dist = analysis_results.get('score_distribution', {})
+         if score_dist.get('score_range', 0) > 3.0:
+             insights.append("High variability in scores indicates inconsistent optimization across metrics")
+
+         return insights
+
+     def _prioritize_recommendations(self, recommendations: List[str]) -> List[Dict[str, Any]]:
+         """Prioritize recommendations based on impact potential"""
+         prioritized = []
+
+         # Simple prioritization based on keywords
+         high_impact_keywords = ['semantic', 'structure', 'authority', 'factual']
+         medium_impact_keywords = ['readability', 'clarity', 'format']
+
+         for i, rec in enumerate(recommendations):
+             priority = 'low'
+             if any(keyword in rec.lower() for keyword in high_impact_keywords):
+                 priority = 'high'
+             elif any(keyword in rec.lower() for keyword in medium_impact_keywords):
+                 priority = 'medium'
+
+             prioritized.append({
+                 'recommendation': rec,
+                 'priority': priority,
+                 'order': i + 1
+             })
+
+         # Sort by priority
+         priority_order = {'high': 1, 'medium': 2, 'low': 3}
+         prioritized.sort(key=lambda x: priority_order[x['priority']])
+
+         return prioritized
+
+     def _create_optimization_roadmap(self, analysis_results: Dict[str, Any]) -> Dict[str, List[str]]:
+         """Create a phased optimization roadmap"""
+         roadmap = {
+             'immediate_actions': [],
+             'short_term_goals': [],
+             'long_term_strategy': []
+         }
+
+         overall_score = analysis_results.get('overall_score', 0)
+         worst_metric = analysis_results.get('lowest_performing_metric', {})
+
+         # Immediate actions based on worst performing metric
+         if worst_metric.get('score', 10) < 5.0:
+             roadmap['immediate_actions'].append(f"Address critical issues in {worst_metric.get('metric', 'low-scoring areas')}")
+
+         # Short-term goals
+         if overall_score < 7.0:
+             roadmap['short_term_goals'].append("Improve overall GEO score to above 7.0")
+             roadmap['short_term_goals'].append("Enhance content structure and semantic richness")
+
+         # Long-term strategy
+         roadmap['long_term_strategy'].append("Establish consistent GEO optimization process")
+         roadmap['long_term_strategy'].append("Monitor and track AI search performance")
+
+         return roadmap
+
+     def _assess_competitive_position(self, analysis_results: Dict[str, Any]) -> Dict[str, Any]:
+         """Assess competitive position based on scores"""
+         overall_score = analysis_results.get('overall_score', 0)
+
+         if overall_score >= 8.5:
+             position = "market_leader"
+             description = "Content is highly optimized for AI search engines"
+         elif overall_score >= 7.0:
+             position = "competitive"
+             description = "Content performs well but has room for improvement"
+         elif overall_score >= 5.5:
+             position = "average"
+             description = "Content meets basic standards but lacks optimization"
+         else:
+             position = "needs_work"
+             description = "Content requires significant optimization for AI search"
+
+         return {
+             'position': position,
+             'description': description,
+             'score': overall_score,
+             'percentile_estimate': min(overall_score * 10, 100)  # Rough percentile estimate
+         }
+
+     def _get_timestamp(self) -> str:
+         """Get current timestamp"""
+         from datetime import datetime
          return datetime.now().strftime('%Y-%m-%d %H:%M:%S')
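
For context on how this class is driven end to end, here is a minimal usage sketch (not part of the commit). It assumes a LangChain-compatible chat model; `ChatOpenAI`, the model name, and the sample page data are illustrative assumptions, and any chat model object that supports LCEL's `prompt | llm` composition should slot in the same way.

```python
# Hypothetical wiring for GEOScorer -- a sketch, not part of the commit.
# The model class/name and page data below are illustrative assumptions.
from langchain_openai import ChatOpenAI

from utils.scorer import GEOScorer

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)  # any LangChain chat model
scorer = GEOScorer(llm)

pages = [
    {"url": "https://example.com/a", "title": "Page A", "content": "..."},
    {"url": "https://example.com/b", "title": "Page B", "content": "..."},
]

# Per-page analysis: detailed=True uses the 8-criteria prompt,
# detailed=False uses the 4-criteria quick prompt.
results = scorer.analyze_multiple_pages(pages, detailed=True)

# Roll the per-page analyses up into site-level scores, then a report.
aggregate = scorer.calculate_aggregate_scores(results)
report = scorer.generate_geo_report(aggregate, website_url="https://example.com")
print(report.get("executive_summary", report))
```

One caveat when running this: the prompt strings embed literal JSON braces, which `ChatPromptTemplate` treats as f-string template placeholders, so they may need escaping as `{{` and `}}` before `chain.invoke({})` succeeds; otherwise the method's broad `except` clause surfaces the problem as an `error` key in the returned dict rather than raising.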