"""
GEO Scorer Data Integration Fix
Handles various data formats from web scrapers and ensures compatibility
"""

import logging
import traceback
from typing import Dict, Any, List, Union, Optional

# NOTE: adjust this import to wherever GEOScorer and GEOConfig live in your
# project; the module name used here is an assumption.
from geo_scorer import GEOScorer, GEOConfig

class GEODataAdapter:
    """Adapter to handle different data formats from web scrapers"""
    
    def __init__(self, logger: Optional[logging.Logger] = None):
        self.logger = logger or logging.getLogger(__name__)
    
    def normalize_scraped_data(self, scraped_data: Union[Dict, List]) -> List[Dict[str, Any]]:
        """
        Normalize scraped data to the format expected by GEOScorer
        
        Args:
            scraped_data: Raw data from web scraper (various formats)
            
        Returns:
            List[Dict]: Normalized data ready for GEO analysis
        """
        try:
            # Handle different input formats
            if isinstance(scraped_data, dict):
                # Single page data
                normalized = [self._normalize_single_page(scraped_data)]
            elif isinstance(scraped_data, list):
                # Multiple pages
                normalized = [self._normalize_single_page(page) for page in scraped_data]
            else:
                raise ValueError(f"Unsupported data type: {type(scraped_data)}")
            
            # Filter out invalid entries
            valid_pages = [page for page in normalized if page.get('content')]
            
            self.logger.info(f"Normalized {len(valid_pages)} valid pages from {len(normalized) if isinstance(normalized, list) else 1} total")
            
            return valid_pages
            
        except Exception as e:
            self.logger.error(f"Data normalization failed: {e}")
            return []
    
    def _find_field(self, page_data: Dict[str, Any], field_names: List[str]) -> str:
        """Return the first non-empty value for any of field_names, checking
        top-level keys first and then one level of nested dicts (some scrapers
        wrap results, e.g. {'page_data': {'body': ...}})."""
        for field in field_names:
            if field in page_data and page_data[field]:
                return str(page_data[field])
        for value in page_data.values():
            if isinstance(value, dict):
                for field in field_names:
                    if field in value and value[field]:
                        return str(value[field])
        return ""

    def _normalize_single_page(self, page_data: Dict[str, Any]) -> Dict[str, Any]:
        """Normalize a single page's data structure"""

        # Common field mappings from different scrapers
        content_fields = ['content', 'text', 'body', 'html_content', 'page_content', 'main_content']
        title_fields = ['title', 'page_title', 'heading', 'h1', 'name']
        url_fields = ['url', 'link', 'page_url', 'source_url', 'href']

        # Extract the core fields, trying each alias in turn
        content = self._find_field(page_data, content_fields)
        title = self._find_field(page_data, title_fields) or "Untitled Page"
        url = self._find_field(page_data, url_fields)
        
        # Create normalized structure
        normalized = {
            'content': content,
            'title': title,
            'url': url,
            'word_count': len(content.split()) if content else 0,
            'original_data': page_data  # Keep original for debugging
        }
        
        # Add any additional metadata
        metadata_fields = ['description', 'keywords', 'author', 'date', 'meta_description']
        for field in metadata_fields:
            if field in page_data:
                normalized[field] = page_data[field]
        
        return normalized
    
    def validate_normalized_data(self, normalized_data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Validate normalized data and provide diagnostics"""
        
        validation_results = {
            'total_pages': len(normalized_data),
            'valid_pages': 0,
            'invalid_pages': 0,
            'issues': [],
            'summary': {}
        }
        
        for i, page in enumerate(normalized_data):
            issues = []
            
            # Check required fields
            if not page.get('content'):
                issues.append(f"Page {i}: Missing or empty content")
            elif len(page['content'].strip()) < 50:
                issues.append(f"Page {i}: Content too short ({len(page['content'])} chars)")
            
            if not page.get('title'):
                issues.append(f"Page {i}: Missing title")
            
            if issues:
                validation_results['invalid_pages'] += 1
                validation_results['issues'].extend(issues)
            else:
                validation_results['valid_pages'] += 1
        
        # Generate summary
        content_lengths = [len(page.get('content', '')) for page in normalized_data if page.get('content')]
        if content_lengths:
            validation_results['summary'] = {
                'avg_content_length': sum(content_lengths) / len(content_lengths),
                'min_content_length': min(content_lengths),
                'max_content_length': max(content_lengths),
                'pages_with_titles': len([p for p in normalized_data if p.get('title') and p['title'] != 'Untitled Page']),
                'pages_with_urls': len([p for p in normalized_data if p.get('url')])
            }
        
        return validation_results
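
# A minimal standalone sketch of the adapter (no LLM or GEOScorer required).
# The input dict below is illustrative; any of the field aliases listed in
# _normalize_single_page would be picked up the same way:
#
#   adapter = GEODataAdapter()
#   pages = adapter.normalize_scraped_data({'text': 'Some page body text...',
#                                           'page_title': 'Example Page'})
#   report = adapter.validate_normalized_data(pages)
#   # report['issues'] flags pages with missing titles or very short content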


class GEOScorerWithAdapter(GEOScorer):
    """Extended GEOScorer with built-in data adaptation"""
    
    def __init__(self, llm, config: Optional[GEOConfig] = None, logger: Optional[logging.Logger] = None):
        super().__init__(llm, config, logger)
        self.data_adapter = GEODataAdapter(logger)
    
    def analyze_scraped_data(self, scraped_data: Union[Dict, List], detailed: bool = True) -> Dict[str, Any]:
        """
        Analyze scraped data with automatic format detection and normalization
        
        Args:
            scraped_data: Raw scraped data in any format
            detailed: Whether to perform detailed analysis
            
        Returns:
            Dict: Complete analysis results with diagnostics
        """
        self.logger.info("Starting analysis of scraped data")
        
        try:
            # Step 1: Normalize the data
            normalized_data = self.data_adapter.normalize_scraped_data(scraped_data)
            
            if not normalized_data:
                return {
                    'error': 'No valid data could be extracted from scraped content',
                    'error_type': 'data_normalization',
                    'original_data_type': str(type(scraped_data)),
                    'original_data_sample': str(scraped_data)[:200] if scraped_data else None
                }
            
            # Step 2: Validate normalized data
            validation_results = self.data_adapter.validate_normalized_data(normalized_data)
            
            # Step 3: Analyze valid pages
            analysis_results = self.analyze_multiple_pages(normalized_data, detailed)
            
            # Step 4: Calculate aggregate scores
            aggregate_results = self.calculate_aggregate_scores(analysis_results)
            
            # Step 5: Combine all results
            complete_results = {
                'data_validation': validation_results,
                'individual_analyses': analysis_results,
                'aggregate_scores': aggregate_results,
                'processing_summary': {
                    'pages_scraped': validation_results['total_pages'],
                    'pages_analyzed': len([r for r in analysis_results if not r.get('error')]),
                    'overall_success_rate': validation_results['valid_pages'] / max(validation_results['total_pages'], 1),
                    'analysis_type': 'detailed' if detailed else 'quick'
                }
            }
            
            self.logger.info(f"Analysis completed: {complete_results['processing_summary']}")
            return complete_results
            
        except Exception as e:
            self.logger.error(f"Scraped data analysis failed: {e}")
            return {
                'error': f'Analysis failed: {str(e)}',
                'error_type': 'system',
                'original_data_type': str(type(scraped_data)),
                'traceback': traceback.format_exc()
            }
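
# End-to-end usage sketch. `my_llm_client` stands in for whatever LLM handle
# your GEOScorer constructor expects; it is not defined in this module.
#
#   scorer = GEOScorerWithAdapter(llm=my_llm_client)
#   results = scorer.analyze_scraped_data(raw_scraper_output, detailed=True)
#   if 'error' in results:
#       print(results['error_type'], results['error'])
#   else:
#       print(results['aggregate_scores'])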


# Debugging utility functions
def debug_scraped_data(scraped_data: Union[Dict, List]) -> Dict[str, Any]:
    """
    Debug utility to understand the structure of scraped data
    
    Args:
        scraped_data: The raw scraped data causing issues
        
    Returns:
        Dict: Detailed breakdown of the data structure
    """
    debug_info = {
        'data_type': str(type(scraped_data)),
        'data_structure': {},
        'sample_content': {},
        'recommendations': []
    }
    
    try:
        if isinstance(scraped_data, dict):
            debug_info['data_structure'] = {
                'is_dict': True,
                'keys': list(scraped_data.keys()),
                'key_count': len(scraped_data)
            }
            
            # Sample first few key-value pairs
            for key, value in list(scraped_data.items())[:5]:
                debug_info['sample_content'][key] = {
                    'type': str(type(value)),
                    'length': len(str(value)) if value else 0,
                    'sample': str(value)[:100] if value else None
                }
            
            # Check for common content fields
            content_fields = ['content', 'text', 'body', 'html_content', 'page_content']
            found_content_fields = [field for field in content_fields if field in scraped_data]
            
            if found_content_fields:
                debug_info['recommendations'].append(f"Found potential content fields: {found_content_fields}")
            else:
                debug_info['recommendations'].append("No standard content fields found. Check field names.")
                
        elif isinstance(scraped_data, list):
            debug_info['data_structure'] = {
                'is_list': True,
                'length': len(scraped_data),
                'first_item_type': str(type(scraped_data[0])) if scraped_data else 'empty'
            }
            
            if scraped_data and isinstance(scraped_data[0], dict):
                debug_info['sample_content']['first_item_keys'] = list(scraped_data[0].keys())
                
        else:
            debug_info['recommendations'].append(f"Unexpected data type: {type(scraped_data)}")
            
    except Exception as e:
        debug_info['error'] = f"Debug analysis failed: {str(e)}"
    
    return debug_info
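
# Triage sketch: run this on the raw scraper output before wiring it into
# the scorer, to see which fields the adapter will (or won't) recognize.
#
#   info = debug_scraped_data(raw_scraper_output)
#   print(info['data_structure'])
#   print(info['recommendations'])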


def create_test_scraped_data() -> List[Dict[str, Any]]:
    """Create test data in various formats that scrapers might return"""
    
    # Format 1: Standard format
    format1 = {
        'content': 'This is the main content of the page about AI optimization.',
        'title': 'AI Optimization Guide',
        'url': 'https://example.com/ai-guide'
    }
    
    # Format 2: Different field names
    format2 = {
        'text': 'Content about machine learning best practices.',
        'page_title': 'ML Best Practices',
        'link': 'https://example.com/ml-practices'
    }
    
    # Format 3: Nested structure
    format3 = {
        'page_data': {
            'body': 'Deep learning techniques for content optimization.',
            'heading': 'Deep Learning Guide'
        },
        'metadata': {
            'source_url': 'https://example.com/deep-learning'
        }
    }
    
    return [format1, format2, format3]


# Usage example and testing
def test_data_integration():
    """Test the data integration fixes"""
    
    # Test with various data formats
    test_data = create_test_scraped_data()
    
    # Debug the data first
    for i, data in enumerate(test_data):
        print(f"\n--- Debug Info for Test Data {i+1} ---")
        debug_info = debug_scraped_data(data)
        print(f"Data type: {debug_info['data_type']}")
        print(f"Keys: {debug_info['data_structure'].get('keys', 'N/A')}")
        print(f"Recommendations: {debug_info['recommendations']}")
    
    # Test normalization
    adapter = GEODataAdapter()
    normalized = adapter.normalize_scraped_data(test_data)
    
    print(f"\n--- Normalization Results ---")
    print(f"Original items: {len(test_data)}")
    print(f"Normalized items: {len(normalized)}")
    
    for i, item in enumerate(normalized):
        print(f"Item {i+1}: Title='{item['title']}', Content length={len(item['content'])}")
    
    # Test validation
    validation = adapter.validate_normalized_data(normalized)
    print(f"\n--- Validation Results ---")
    print(f"Valid pages: {validation['valid_pages']}/{validation['total_pages']}")
    print(f"Issues: {validation['issues']}")


if __name__ == "__main__":
    test_data_integration()