File size: 8,755 Bytes
a19173c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
import re
import logging
from typing import Dict, List, Optional, Any
from datetime import datetime

class DataValidator:
    """Data validation and fact-checking utilities for web-research results.

    Operates on the dict shape produced by the search layer:
    ``{'google_results': [...], 'scraped_content': [...], 'sources': [...]}``.
    Filters results down to credible sources, mines numeric statistics out
    of free text, and scores overall data quality/credibility on a 0-10 scale.
    """

    def __init__(self):
        # Domain fragments treated as trustworthy. NOTE(review): matching is
        # a plain substring test (see _is_credible_source), so e.g. 'gov'
        # also matches 'governmentcheese.com' — confirm whether stricter
        # suffix matching is wanted before tightening.
        self.credible_domains = [
            'edu', 'gov', 'org', 'reuters.com', 'bloomberg.com',
            'wsj.com', 'ft.com', 'nature.com', 'science.org',
            'who.int', 'cdc.gov', 'fda.gov', 'sec.gov'
        ]

    def validate_research_data(self, search_results: Dict) -> Dict:
        """Validate and clean research data.

        Args:
            search_results: Raw results. Reads 'google_results' (list of
                Google CSE-style items with 'link'/'title'/'snippet'/
                'displayLink') and 'scraped_content' (list of dicts with a
                'content' string). Missing keys are treated as empty.

        Returns:
            Dict with 'verified_content' (credible results), 'statistics'
            (extracted numeric facts), 'credible_sources' (their URLs) and
            a 0-10 'quality_score'.
        """
        validated_data = {
            'verified_content': [],
            'statistics': [],
            'credible_sources': [],
            'quality_score': 0
        }

        # Keep only Google results whose link comes from a credible domain.
        for item in search_results.get('google_results', []):
            link = item.get('link', '')
            if self._is_credible_source(link):
                validated_data['verified_content'].append({
                    'title': item.get('title', ''),
                    'content': item.get('snippet', ''),
                    'source': item.get('displayLink', ''),
                    'url': link,
                    'credibility': 'high'
                })
                # Bug fix: 'credible_sources' was never populated, which made
                # the credible-source component of the quality score always 0.
                validated_data['credible_sources'].append(link)

        # Extract and validate statistics from scraped page bodies.
        for content_item in search_results.get('scraped_content', []):
            validated_data['statistics'].extend(
                self.extract_statistics(content_item.get('content', '')))

        # Also mine the Google result snippets for statistics.
        for item in search_results.get('google_results', []):
            validated_data['statistics'].extend(
                self.extract_statistics(item.get('snippet', '')))

        # Overall 0-10 quality score from content/statistics/source counts.
        validated_data['quality_score'] = self._calculate_quality_score(validated_data)

        return validated_data

    def extract_metrics(self, validated_data: Dict) -> List[Dict]:
        """Extract key metrics from validated data.

        Pulls metrics from pre-extracted 'statistics' entries and from KPI
        phrases found in 'verified_content' text.

        Returns:
            Up to 10 metric dicts ('metric'/'type'/'context'/'confidence'),
            sorted by descending confidence.
        """
        metrics = []

        # Promote already-extracted statistics that carry both value and type.
        for stat in validated_data.get('statistics', []):
            if stat.get('value') and stat.get('type'):
                metrics.append({
                    'metric': stat['value'],
                    'type': stat['type'],
                    'context': stat.get('context', ''),
                    'confidence': stat.get('confidence', 0.5)
                })

        # Scan credible-result snippets for KPI-style phrases.
        for content in validated_data.get('verified_content', []):
            metrics.extend(
                self._extract_metrics_from_text(content.get('content', '')))

        # NOTE(review): validate_research_data never emits a 'scraped_content'
        # key, so this loop only fires when callers pass raw search results
        # directly; kept for backward compatibility — confirm intent.
        for content in validated_data.get('scraped_content', []):
            metrics.extend(
                self._extract_metrics_from_text(content.get('content', '')))

        # Sort by confidence and return the top metrics.
        metrics.sort(key=lambda m: m.get('confidence', 0), reverse=True)
        return metrics[:10]

    def extract_statistics(self, text: str) -> List[Dict]:
        """Extract statistical data (percentages, money, growth, etc.) from text.

        Returns:
            Up to 5 dicts with 'value' (matched text), 'type' (pattern name),
            'context' (surrounding text window) and 'confidence' (0.1-1.0).
        """
        statistics = []

        # Patterns for different kinds of statistics, keyed by type label.
        patterns = {
            'percentage': r'(\d+(?:\.\d+)?)\s*%',
            'currency': r'\$(\d{1,3}(?:,\d{3})*(?:\.\d+)?)\s*(billion|million|trillion)?',
            'growth': r'(\d+(?:\.\d+)?)\s*(times|fold|x)\s*(?:increase|growth|rise)',
            'large_numbers': r'(\d{1,3}(?:,\d{3})*)\s*(billion|million|thousand)',
            'ratios': r'(\d+(?:\.\d+)?):\s*(\d+(?:\.\d+)?)',
            # Bug fix: word boundaries so a year is not pulled out of the
            # middle of a longer number (e.g. '120233').
            'years': r'\b(20\d{2})\b',
            'quantities': r'(\d+(?:,\d{3})*)\s*(units|people|companies|users|customers)'
        }

        for stat_type, pattern in patterns.items():
            for match in re.finditer(pattern, text, re.IGNORECASE):
                statistics.append({
                    'value': match.group(0),
                    'type': stat_type,
                    'context': self._extract_context(text, match.start(), match.end()),
                    'confidence': self._calculate_stat_confidence(match.group(0), stat_type)
                })

        return statistics[:5]  # Return top 5 statistics

    def calculate_credibility_score(self, search_results: Dict) -> float:
        """Calculate an overall 0-10 credibility score for research results.

        Base score is the credible fraction of 'sources' scaled to 10, plus
        up to a 2-point bonus for long scraped content. Returns 0.0 when
        there are no sources.
        """
        sources = search_results.get('sources', [])
        total_sources = len(sources)
        if total_sources == 0:
            return 0.0

        credible_count = sum(1 for source in sources
                             if self._is_credible_source(source))

        # Base credibility on source quality.
        base_score = (credible_count / total_sources) * 10

        # Adjust for content quality: longer scraped pages earn a bonus.
        content_items = search_results.get('scraped_content', [])
        if content_items:
            avg_content_length = (sum(len(item.get('content', ''))
                                      for item in content_items)
                                  / len(content_items))
            content_bonus = min(avg_content_length / 1000, 2.0)  # Up to 2 point bonus
            base_score += content_bonus

        return min(base_score, 10.0)  # Cap at 10

    def _is_credible_source(self, url: str) -> bool:
        """Return True if *url* contains any credible-domain fragment.

        Empty/None URLs are not credible. Matching is case-insensitive
        substring containment against self.credible_domains.
        """
        if not url:
            return False

        url_lower = url.lower()
        return any(domain in url_lower for domain in self.credible_domains)

    def _calculate_quality_score(self, validated_data: Dict) -> float:
        """Calculate an overall 0-10 data-quality score.

        Weighted count of verified content (up to 5 pts), statistics
        (up to 3 pts) and credible sources (up to 2 pts).
        """
        score = 0.0

        # Points for verified content.
        verified_count = len(validated_data.get('verified_content', []))
        score += min(verified_count * 1.5, 5.0)  # Up to 5 points

        # Points for statistics.
        stats_count = len(validated_data.get('statistics', []))
        score += min(stats_count * 0.5, 3.0)  # Up to 3 points

        # Points for credible sources.
        credible_count = len(validated_data.get('credible_sources', []))
        score += min(credible_count * 1.0, 2.0)  # Up to 2 points

        return min(score, 10.0)

    def _extract_metrics_from_text(self, text: str) -> List[Dict]:
        """Extract up to 3 KPI-style metrics (ROI, revenue, growth, ...) from text."""
        metrics = []

        # Key-performance-indicator phrases followed by a number.
        kpi_patterns = [
            r'ROI.*?(\d+(?:\.\d+)?%)',
            r'revenue.*?(\$\d+(?:,\d{3})*(?:\.\d+)?)',
            r'growth.*?(\d+(?:\.\d+)?%)',
            r'market share.*?(\d+(?:\.\d+)?%)',
            r'efficiency.*?(\d+(?:\.\d+)?%)',
        ]

        for pattern in kpi_patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                metrics.append({
                    'metric': match.group(1),
                    'type': 'kpi',
                    'context': match.group(0),
                    'confidence': 0.8
                })

        return metrics[:3]  # Return top 3 metrics

    def _extract_context(self, text: str, start: int, end: int, window: int = 50) -> str:
        """Return up to *window* chars of text on each side of [start, end)."""
        context_start = max(0, start - window)
        context_end = min(len(text), end + window)

        return text[context_start:context_end].strip()

    def _calculate_stat_confidence(self, value: str, stat_type: str) -> float:
        """Calculate a 0.1-1.0 confidence score for a matched statistic."""
        confidence = 0.5  # Base confidence

        # Higher confidence for precise, verifiable types.
        if stat_type in ['percentage', 'currency']:
            confidence += 0.3

        # Lower confidence for round numbers (likely estimates). Bug fix:
        # the previous check used unanchored re.match(r'\d+0+', ...), which
        # flagged '105' as round via its '10' prefix, and stripped only
        # ',', '.', '%', so currency values keeping their '$' were never
        # tested. Now all non-digits are stripped and the whole digit
        # string must end in at least one zero.
        digits = re.sub(r'\D', '', value)
        if re.fullmatch(r'\d+0+', digits):
            confidence -= 0.2

        return max(0.1, min(1.0, confidence))

    def fact_check_claim(self, claim: str, context: Dict) -> Dict:
        """Basic fact-checking for claims (placeholder for advanced implementation).

        Always returns a 'requires_manual_review' verdict with empty source
        lists; *context* is currently unused.
        """
        return {
            'claim': claim,
            'verification_status': 'requires_manual_review',
            'confidence': 0.5,
            'supporting_sources': [],
            'contradicting_sources': []
        }