MBilal-72 committed on
Commit 994eab4 · verified · 1 Parent(s): c34a608

update utils/scorer.py with prompts

Files changed (1)
  1. utils/scorer.py +76 -326
utils/scorer.py CHANGED
@@ -1,240 +1,57 @@
 """
-Fixed GEO Scoring Module - Drop-in replacement for your original
-This version fixes the data format issues while keeping your existing structure
+GEO Scoring Module
+Analyzes content for Generative Engine Optimization (GEO) performance
 """
 
 import json
-import re
-import logging
-from typing import Dict, Any, List, Union, Optional
-from datetime import datetime
+from typing import Dict, Any, List
 from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
 
 
 class GEOScorer:
-    """Main class for calculating GEO scores and analysis - IMPROVED VERSION"""
+    """Main class for calculating GEO scores and analysis"""
 
-    def __init__(self, llm, logger=None):
+    def __init__(self, llm):
         self.llm = llm
-        self.logger = logger or self._setup_logger()
         self.setup_prompts()
 
-    def _setup_logger(self):
-        """Setup default logger"""
-        logger = logging.getLogger(__name__)
-        if not logger.handlers:
-            handler = logging.StreamHandler()
-            formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
-            handler.setFormatter(formatter)
-            logger.addHandler(handler)
-        logger.setLevel(logging.INFO)
-        return logger
-
     def setup_prompts(self):
         """Initialize prompts for different types of analysis"""
 
         # Main GEO analysis prompt
-        self.geo_analysis_prompt = """You are a Generative Engine Optimizer (GEO) specialist. Analyze the provided content for its effectiveness in AI-powered search engines and LLM systems.
-
-Evaluate the content based on these GEO criteria (score 1-10 each):
-
-1. **AI Search Visibility**: How likely is this content to be surfaced by AI search engines?
-2. **Query Intent Matching**: How well does the content match common user queries?
-3. **Factual Accuracy & Authority**: How trustworthy and authoritative is the information?
-4. **Conversational Readiness**: How suitable is the content for AI chat responses?
-5. **Semantic Richness**: How well does the content use relevant semantic keywords?
-6. **Context Completeness**: Does the content provide complete, self-contained answers?
-7. **Citation Worthiness**: How likely are AI systems to cite this content?
-8. **Multi-Query Coverage**: Does the content answer multiple related questions?
-
-Also identify:
-- Primary topics and entities
-- Missing information gaps
-- Optimization opportunities
-- Specific enhancement recommendations
-
-IMPORTANT: Respond ONLY with valid JSON. Do not include any text before or after the JSON.
-
-{
-  "geo_scores": {
-    "ai_search_visibility": 7.5,
-    "query_intent_matching": 8.0,
-    "factual_accuracy": 9.0,
-    "conversational_readiness": 6.5,
-    "semantic_richness": 7.0,
-    "context_completeness": 8.5,
-    "citation_worthiness": 7.8,
-    "multi_query_coverage": 6.0
-  },
-  "overall_geo_score": 7.5,
-  "primary_topics": ["topic1", "topic2"],
-  "entities": ["entity1", "entity2"],
-  "missing_gaps": ["gap1", "gap2"],
-  "optimization_opportunities": [
-    {
-      "type": "semantic_enhancement",
-      "description": "Add more related terms",
-      "priority": "high"
-    }
-  ],
-  "recommendations": [
-    "Specific actionable recommendation 1",
-    "Specific actionable recommendation 2"
-  ]
-}"""
+        self.geo_analysis_prompt = "You are a Generative Engine Optimization (GEO) Specialist. Your task is to critically analyze the input content for its effectiveness in AI-powered search engines and large language model (LLM) systems. Evaluate the content using the following GEO criteria, assigning a score from 1 to 10 for each: \n\n1. AI Search Visibility - How likely is the content to be surfaced by AI search engines?\n2. Query Intent Matching - How well does the content align with common user queries?\n3. Factual Accuracy & Authority - How trustworthy and authoritative is the information?\n4. Conversational Readiness - Is the content well-suited for AI chat responses?\n5. Semantic Richness - Does the content effectively use relevant semantic keywords?\n6. Context Completeness - Is the content self-contained and does it provide complete answers?\n7. Citation Worthiness - How likely is the content to be cited by AI systems?\n8. Multi-Query Coverage - Does the content address multiple related questions?\n\nAlso provide:\n- Key topics and entities mentioned\n- Missing information or content gaps\n- Specific optimization opportunities\n- Actionable enhancement recommendations\n\nRespond strictly in JSON format using the structure below (double curly braces shown here to escape string formatting, do NOT include them in actual output):\n\n{{\n \"geo_scores\": {{\n \"ai_search_visibility\": 0.0,\n \"query_intent_matching\": 0.0,\n \"factual_accuracy\": 0.0,\n \"conversational_readiness\": 0.0,\n \"semantic_richness\": 0.0,\n \"context_completeness\": 0.0,\n \"citation_worthiness\": 0.0,\n \"multi_query_coverage\": 0.0\n }},\n \"overall_geo_score\": 0.0,\n \"primary_topics\": [\"topic1\", \"topic2\"],\n \"entities\": [\"entity1\", \"entity2\"],\n \"missing_gaps\": [\"gap1\", \"gap2\"],\n \"optimization_opportunities\": [\n {{\n \"type\": \"semantic_enhancement\",\n \"description\": \"Describe the improvement opportunity\",\n \"priority\": \"high\"\n }}\n ],\n \"recommendations\": [\n \"Write clear and specific suggestions to improve the content\"\n ]\n}}"
 
         # Quick scoring prompt for faster analysis
-        self.quick_score_prompt = """Analyze this content for AI search optimization. Provide scores (1-10) for:
-
-1. AI Search Visibility
-2. Query Intent Matching
-3. Conversational Readiness
-4. Citation Worthiness
-
-IMPORTANT: Respond ONLY with valid JSON. Do not include any text before or after the JSON.
-
-{
-  "scores": {
-    "ai_search_visibility": 7.5,
-    "query_intent_matching": 8.0,
-    "conversational_readiness": 6.5,
-    "citation_worthiness": 7.8
-  },
-  "overall_score": 7.5,
-  "top_recommendation": "Most important improvement needed"
-}"""
+        self.quick_score_prompt = "You are an AI Search Optimization Analyst. Evaluate the given content and provide a quick scoring based on key criteria.\nRate each of the following from 1 to 10:\n1. AI Search Visibility\n2. Query Intent Matching\n3. Conversational Readiness\n4. Citation Worthiness\n\nRespond strictly in JSON format using the structure below:\n\n{{\n \"scores\": {{\n \"ai_search_visibility\": 0.0,\n \"query_intent_matching\": 0.0,\n \"conversational_readiness\": 0.0,\n \"citation_worthiness\": 0.0\n }},\n \"overall_score\": 0.0,\n \"top_recommendation\": \"Provide the most critical improvement needed\"\n}}"
 
         # Competitive analysis prompt
-        self.competitive_prompt = """Compare these content pieces for GEO performance. Identify which performs better for AI search and why.
-
-Content A: {content_a}
-
-Content B: {content_b}
-
-IMPORTANT: Respond ONLY with valid JSON. Do not include any text before or after the JSON.
-
-{
-  "winner": "A",
-  "score_comparison": {
-    "content_a_score": 7.5,
-    "content_b_score": 8.2
-  },
-  "key_differences": ["difference1", "difference2"],
-  "improvement_suggestions": {
-    "content_a": ["suggestion1"],
-    "content_b": ["suggestion1"]
-  }
-}"""
+        self.competitive_prompt = "Compare these content pieces for GEO performance. Identify which performs better for AI search and why.\nContent A: {content_a}\nContent B: {content_b}\nProvide analysis in JSON:\n{{\n \"winner\": \"A\" or \"B\",\n \"score_comparison\": {{\n \"content_a_score\": 7.5,\n \"content_b_score\": 8.2\n }},\n \"key_differences\": [\"difference1\", \"difference2\"],\n \"improvement_suggestions\": {{\n \"content_a\": [\"suggestion1\"],\n \"content_b\": [\"suggestion1\"]\n }}\n}}"
-
-    def _normalize_page_data(self, page_data):
-        """
-        FIXED: Normalize different data formats from web scrapers
-        This handles the 'content' key error you were seeing
-        """
-        if not isinstance(page_data, dict):
-            self.logger.warning(f"Expected dict, got {type(page_data)}")
-            return None
-
-        # Try different field names for content
-        content_fields = ['content', 'text', 'body', 'html_content', 'page_content', 'main_content']
-        content = ""
-
-        for field in content_fields:
-            if field in page_data and page_data[field]:
-                content = str(page_data[field])
-                break
-
-        if not content:
-            self.logger.warning(f"No content found in page data. Available keys: {list(page_data.keys())}")
-            return None
-
-        # Try different field names for title
-        title_fields = ['title', 'page_title', 'heading', 'h1', 'name']
-        title = "Untitled Page"
-
-        for field in title_fields:
-            if field in page_data and page_data[field]:
-                title = str(page_data[field])
-                break
-
-        # Try different field names for URL
-        url_fields = ['url', 'link', 'page_url', 'source_url', 'href']
-        url = ""
-
-        for field in url_fields:
-            if field in page_data and page_data[field]:
-                url = str(page_data[field])
-                break
-
-        return {
-            'content': content,
-            'title': title,
-            'url': url,
-            'word_count': len(content.split()) if content else 0
-        }
-
-    def _sanitize_content(self, content):
-        """Basic content sanitization"""
-        if not content:
-            return ""
-
-        # Remove potential prompt injection patterns
-        dangerous_patterns = [
-            r'ignore\s+previous\s+instructions',
-            r'system\s*:',
-            r'assistant\s*:',
-        ]
-
-        sanitized = content
-        for pattern in dangerous_patterns:
-            sanitized = re.sub(pattern, '[FILTERED]', sanitized, flags=re.IGNORECASE)
-
-        return sanitized[:8000]  # Limit length
 
     def analyze_page_geo(self, content: str, title: str, detailed: bool = True) -> Dict[str, Any]:
         """
         Analyze a single page for GEO performance
-        FIXED: Better error handling and validation
         """
         try:
-            # Input validation
-            if not content or not content.strip():
-                return {'error': 'Empty or missing content', 'error_type': 'validation'}
-
-            if len(content.strip()) < 50:
-                return {'error': 'Content too short for analysis', 'error_type': 'validation'}
-
-            # Sanitize content
-            sanitized_content = self._sanitize_content(content)
-
             # Choose prompt based on detail level
             if detailed:
                 system_prompt = self.geo_analysis_prompt
-                max_length = 8000
+                user_message = f"Title: {title}\n\nContent: {content[:8000]}"
             else:
                 system_prompt = self.quick_score_prompt
-                max_length = 4000
-
-            # Smart truncation
-            if len(sanitized_content) > max_length:
-                truncated = sanitized_content[:max_length]
-                # Try to end at a sentence
-                last_period = truncated.rfind('. ')
-                if last_period > max_length * 0.8:
-                    sanitized_content = truncated[:last_period + 1]
-                else:
-                    sanitized_content = truncated + "..."
-
-            user_message = f"Title: {title}\n\nContent: {sanitized_content}"
+                user_message = f"Title: {title}\n\nContent: {content[:4000]}"
 
             # Build prompt and run analysis
             prompt_template = ChatPromptTemplate.from_messages([
                 SystemMessagePromptTemplate.from_template(system_prompt),
                 HumanMessagePromptTemplate.from_template(user_message)
             ])
-
             chain = prompt_template | self.llm
-            result = chain.invoke({})
+            result = chain.invoke({})  # No variables needed
 
             # Extract and parse result
             result_content = result.content if hasattr(result, 'content') else str(result)
@@ -250,80 +67,66 @@ IMPORTANT: Respond ONLY with valid JSON. Do not include any text before or after
 
             return parsed_result
 
-        except json.JSONDecodeError as e:
-            self.logger.error(f"JSON parsing failed for '{title}': {e}")
-            return {'error': 'Invalid response format from LLM', 'error_type': 'parsing'}
         except Exception as e:
-            self.logger.error(f"Analysis failed for '{title}': {e}")
-            return {'error': f"Analysis failed: {str(e)}", 'error_type': 'system'}
+            return {'error': f"GEO analysis failed: {str(e)}"}
 
     def analyze_multiple_pages(self, pages_data: List[Dict[str, Any]], detailed: bool = True) -> List[Dict[str, Any]]:
         """
-        FIXED: Analyze multiple pages with automatic data normalization
-        This handles different data formats from web scrapers
-        """
-        if not pages_data:
-            self.logger.error("No pages data provided")
-            return [{'error': 'No pages data provided', 'error_type': 'validation'}]
-
+        Analyze multiple pages and return consolidated results
+
+        Args:
+            pages_data (List[Dict]): List of page data with content and metadata
+            detailed (bool): Whether to perform detailed analysis
+
+        Returns:
+            List[Dict]: List of GEO analysis results
+        """
         results = []
-        successful_analyses = 0
-
-        self.logger.info(f"Starting analysis of {len(pages_data)} pages")
 
         for i, page_data in enumerate(pages_data):
            try:
-                # FIXED: Normalize the data format
-                normalized_page = self._normalize_page_data(page_data)
-
-                if not normalized_page:
-                    self.logger.warning(f"Page {i}: Could not extract content. Available keys: {list(page_data.keys()) if isinstance(page_data, dict) else 'Not a dict'}")
-                    results.append({
-                        'page_index': i,
-                        'error': 'Could not extract content from page data',
-                        'error_type': 'data_format',
-                        'available_keys': list(page_data.keys()) if isinstance(page_data, dict) else None
-                    })
-                    continue
-
-                content = normalized_page['content']
-                title = normalized_page['title']
+                content = page_data.get('content', '')
+                title = page_data.get('title', f'Page {i+1}')
 
                 analysis = self.analyze_page_geo(content, title, detailed)
 
                 # Add page-specific metadata
                 analysis.update({
-                    'page_url': normalized_page.get('url', ''),
+                    'page_url': page_data.get('url', ''),
                     'page_index': i,
-                    'source_word_count': normalized_page.get('word_count', 0)
+                    'source_word_count': page_data.get('word_count', 0)
                 })
 
-                if 'error' not in analysis:
-                    successful_analyses += 1
-
                 results.append(analysis)
 
             except Exception as e:
-                self.logger.error(f"Failed to analyze page {i}: {e}")
                 results.append({
                     'page_index': i,
-                    'error': f"Analysis failed: {str(e)}",
-                    'error_type': 'system'
+                    'page_url': page_data.get('url', ''),
+                    'error': f"Analysis failed: {str(e)}"
                 })
 
-        self.logger.info(f"Completed analysis: {successful_analyses}/{len(pages_data)} successful")
        return results
 
     def compare_content_geo(self, content_a: str, content_b: str, titles: tuple = None) -> Dict[str, Any]:
         """
         Compare two pieces of content for GEO performance
+
+        Args:
+            content_a (str): First content to compare
+            content_b (str): Second content to compare
+            titles (tuple): Optional titles for the content pieces
+
+        Returns:
+            Dict: Comparison analysis results
         """
         try:
             title_a, title_b = titles if titles else ("Content A", "Content B")
 
-            # Sanitize content
-            content_a = self._sanitize_content(content_a)
-            content_b = self._sanitize_content(content_b)
+            prompt_template = ChatPromptTemplate.from_messages([
+                ("system", self.competitive_prompt),
+                ("user", "")
+            ])
 
             # Format the competitive analysis prompt
             formatted_prompt = self.competitive_prompt.format(
@@ -342,31 +145,23 @@ IMPORTANT: Respond ONLY with valid JSON. Do not include any text before or after
             return self._parse_llm_response(result_content)
 
         except Exception as e:
-            self.logger.error(f"Comparison analysis failed: {e}")
-            return {'error': f"Comparison analysis failed: {str(e)}", 'error_type': 'system'}
+            return {'error': f"Comparison analysis failed: {str(e)}"}
 
     def calculate_aggregate_scores(self, individual_results: List[Dict[str, Any]]) -> Dict[str, Any]:
         """
         Calculate aggregate GEO scores from multiple page analyses
-        FIXED: Better error handling for missing data
+
+        Args:
+            individual_results (List[Dict]): List of individual page analysis results
+
+        Returns:
+            Dict: Aggregate scores and insights
         """
         try:
             valid_results = [r for r in individual_results if 'geo_scores' in r and not r.get('error')]
-            error_results = [r for r in individual_results if r.get('error')]
 
             if not valid_results:
-                error_summary = {}
-                for result in error_results:
-                    error_type = result.get('error_type', 'unknown')
-                    error_summary[error_type] = error_summary.get(error_type, 0) + 1
-
-                return {
-                    'error': 'No valid results to aggregate',
-                    'error_type': 'no_data',
-                    'total_pages': len(individual_results),
-                    'error_breakdown': error_summary,
-                    'sample_errors': [r.get('error', 'Unknown error') for r in error_results[:3]]
-                }
+                return {'error': 'No valid results to aggregate'}
 
             # Calculate average scores
             score_keys = list(valid_results[0]['geo_scores'].keys())
@@ -390,7 +185,7 @@ IMPORTANT: Respond ONLY with valid JSON. Do not include any text before or after
                 all_topics.extend(result.get('primary_topics', []))
                 all_entities.extend(result.get('entities', []))
 
-            # Remove duplicates
+            # Remove duplicates and prioritize
             unique_recommendations = list(set(all_recommendations))
             unique_topics = list(set(all_topics))
             unique_entities = list(set(all_entities))
@@ -403,8 +198,6 @@ IMPORTANT: Respond ONLY with valid JSON. Do not include any text before or after
                 'aggregate_scores': avg_scores,
                 'overall_score': overall_avg,
                 'pages_analyzed': len(valid_results),
-                'pages_with_errors': len(error_results),
-                'success_rate': len(valid_results) / len(individual_results) if individual_results else 0,
                 'best_performing_metric': {
                     'metric': best_score[0],
                     'score': best_score[1]
@@ -418,18 +211,24 @@ IMPORTANT: Respond ONLY with valid JSON. Do not include any text before or after
                 'all_entities': unique_entities,
                 'high_priority_opportunities': [
                     opp for opp in all_opportunities
-                    if isinstance(opp, dict) and opp.get('priority') == 'high'
+                    if opp.get('priority') == 'high'
                 ][:5],
                 'score_distribution': self._calculate_score_distribution(avg_scores)
             }
 
         except Exception as e:
-            self.logger.error(f"Aggregation failed: {e}")
-            return {'error': f"Aggregation failed: {str(e)}", 'error_type': 'system'}
+            return {'error': f"Aggregation failed: {str(e)}"}
 
     def generate_geo_report(self, analysis_results: Dict[str, Any], website_url: str = None) -> Dict[str, Any]:
         """
         Generate a comprehensive GEO report
+
+        Args:
+            analysis_results (Dict): Results from aggregate analysis
+            website_url (str): Optional website URL for context
+
+        Returns:
+            Dict: Comprehensive GEO report
         """
         try:
             report = {
@@ -456,50 +255,26 @@ IMPORTANT: Respond ONLY with valid JSON. Do not include any text before or after
             return report
 
         except Exception as e:
-            self.logger.error(f"Report generation failed: {e}")
-            return {'error': f"Report generation failed: {str(e)}", 'error_type': 'system'}
+            return {'error': f"Report generation failed: {str(e)}"}
 
     def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
-        """FIXED: Enhanced LLM response parsing"""
+        """Parse LLM response and extract JSON content"""
         try:
-            # Clean response text
-            cleaned_response = response_text.strip()
-
-            # Try to find JSON content with multiple patterns
-            json_patterns = [
-                r'\{.*\}',  # Simple JSON object
-                r'```json\s*(\{.*?\})\s*```',  # JSON in code blocks
-                r'```\s*(\{.*?\})\s*```'  # Generic code blocks
-            ]
-
-            for pattern in json_patterns:
-                matches = re.findall(pattern, cleaned_response, re.DOTALL)
-                if matches:
-                    json_str = matches[0] if len(matches) == 1 else matches[0]
-                    try:
-                        return json.loads(json_str)
-                    except json.JSONDecodeError:
-                        continue
-
-            # Try parsing the entire response
-            try:
-                return json.loads(cleaned_response)
-            except json.JSONDecodeError:
-                pass
-
-            # If all else fails, return structured error
-            return {
-                'raw_response': response_text[:500],
-                'parsing_error': 'No valid JSON found in LLM response',
-                'error_type': 'parsing'
-            }
+            # Find JSON content in the response
+            json_start = response_text.find('{')
+            json_end = response_text.rfind('}') + 1
 
+            if json_start != -1 and json_end > json_start:
+                json_str = response_text[json_start:json_end]
+                return json.loads(json_str)
+            else:
+                # If no JSON found, return the raw response
+                return {'raw_response': response_text, 'parsing_error': 'No JSON found'}
 
+        except json.JSONDecodeError as e:
+            return {'raw_response': response_text, 'parsing_error': f'JSON decode error: {str(e)}'}
         except Exception as e:
-            return {
-                'raw_response': response_text[:500],
-                'parsing_error': f'Parsing error: {str(e)}',
-                'error_type': 'parsing'
-            }
+            return {'raw_response': response_text, 'parsing_error': f'Unexpected error: {str(e)}'}
 
     def _calculate_score_distribution(self, scores: Dict[str, float]) -> Dict[str, Any]:
         """Calculate distribution of scores for insights"""
@@ -626,35 +401,10 @@ IMPORTANT: Respond ONLY with valid JSON. Do not include any text before or after
             'position': position,
             'description': description,
             'score': overall_score,
-            'percentile_estimate': min(overall_score * 10, 100)
+            'percentile_estimate': min(overall_score * 10, 100)  # Rough percentile estimate
         }
 
     def _get_timestamp(self) -> str:
         """Get current timestamp"""
-        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')
-
-
-# Debug utility function
-def debug_scraped_data_format(scraped_data):
-    """
-    Quick debug function to see what your scraper is returning
-    Add this to your code to debug data format issues
-    """
-    print("=== SCRAPED DATA DEBUG ===")
-    print(f"Data type: {type(scraped_data)}")
-
-    if isinstance(scraped_data, list):
-        print(f"List length: {len(scraped_data)}")
-        if scraped_data:
-            print(f"First item type: {type(scraped_data[0])}")
-            if isinstance(scraped_data[0], dict):
-                print(f"First item keys: {list(scraped_data[0].keys())}")
-                for key, value in list(scraped_data[0].items())[:3]:
-                    print(f"  {key}: {str(value)[:100]}...")
-
-    elif isinstance(scraped_data, dict):
-        print(f"Dict keys: {list(scraped_data.keys())}")
-        for key, value in list(scraped_data.items())[:3]:
-            print(f"  {key}: {str(value)[:100]}...")
-
-    print("=== END DEBUG ===")
+        from datetime import datetime
+        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')
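A note on the `{{ ... }}` escaping the rewritten geo_analysis_prompt relies on: LangChain's `ChatPromptTemplate` treats single braces as input variables, so literal JSON braces in a template string must be doubled or formatting fails. A minimal standalone sketch of the behavior (the prompt text here is illustrative, not taken from the module):

```python
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate

# Single braces are parsed as template variables: this template "expects" an
# input variable literally named '"score"', so formatting it raises a KeyError.
bad = SystemMessagePromptTemplate.from_template('Respond in JSON: {"score": 0.0}')

# Doubled braces render as literal braces, which is why the new
# geo_analysis_prompt escapes its whole JSON skeleton as {{ ... }}.
good = ChatPromptTemplate.from_messages([
    ("system", 'Respond in JSON: {{"score": 0.0}}'),
    ("user", "{content}"),
])
print(good.format_messages(content="some page text")[0].content)
# -> Respond in JSON: {"score": 0.0}
```

The same rule applies to quick_score_prompt and competitive_prompt, which embed JSON skeletons of their own; their braces are escaped accordingly in the diff above.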
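For orientation, a minimal end-to-end usage sketch of the updated module. It assumes `langchain-openai` is installed, `OPENAI_API_KEY` is set, and this file is importable as `utils.scorer`; the model name and the sample page are illustrative, and any LangChain chat model should work in place of `ChatOpenAI`:

```python
from langchain_openai import ChatOpenAI

from utils.scorer import GEOScorer

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
scorer = GEOScorer(llm)

# analyze_multiple_pages expects dicts carrying 'content' plus optional
# 'title', 'url', and 'word_count' keys.
pages = [
    {
        "url": "https://example.com/what-is-geo",
        "title": "What is GEO?",
        "content": "Generative Engine Optimization (GEO) adapts web content "
                   "so that AI search engines surface and cite it.",
        "word_count": 16,
    },
]

results = scorer.analyze_multiple_pages(pages, detailed=True)   # per-page scores
aggregate = scorer.calculate_aggregate_scores(results)          # roll-up
report = scorer.generate_geo_report(aggregate, website_url="https://example.com")

print(aggregate.get("overall_score"), aggregate.get("best_performing_metric"))
```

Note that `analyze_page_geo` feeds the raw page text through `HumanMessagePromptTemplate.from_template`, so content containing literal `{` or `}` can still trip the template parser; pre-escaping braces in `content` is a reasonable safeguard.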