import re
import logging
from typing import Dict, List, Optional, Any
from datetime import datetime


class DataValidator:
    """Data validation and fact-checking utilities for web-research results.

    Operates on the result dict produced by a search/scrape pipeline. The
    methods expect (all optional) keys such as:

    * ``'google_results'``  -- list of Google result dicts with ``'link'``,
      ``'title'``, ``'snippet'``, ``'displayLink'``;
    * ``'scraped_content'`` -- list of dicts with a ``'content'`` text field;
    * ``'sources'``         -- list of URL strings.
    """

    def __init__(self):
        # Substrings that mark a URL as coming from a credible source.
        # NOTE(review): these are matched anywhere in the lowercased URL
        # (see _is_credible_source), so e.g. 'edu' also matches "education"
        # in a path -- confirm this loose matching is intended.
        self.credible_domains = [
            'edu', 'gov', 'org', 'reuters.com', 'bloomberg.com',
            'wsj.com', 'ft.com', 'nature.com', 'science.org',
            'who.int', 'cdc.gov', 'fda.gov', 'sec.gov'
        ]

    def validate_research_data(self, search_results: Dict) -> Dict:
        """Validate and clean research data.

        Args:
            search_results: dict with optional 'google_results' and
                'scraped_content' lists (see class docstring).

        Returns:
            dict with 'verified_content' (credible Google results),
            'statistics' (extracted numeric facts), 'credible_sources'
            (URLs of the verified items) and a 0-10 'quality_score'.
        """
        validated_data = {
            'verified_content': [],
            'statistics': [],
            'credible_sources': [],
            'quality_score': 0
        }

        # Keep only Google results whose URL comes from a credible domain.
        for item in search_results.get('google_results', []):
            url = item.get('link', '')
            if self._is_credible_source(url):
                validated_data['verified_content'].append({
                    'title': item.get('title', ''),
                    'content': item.get('snippet', ''),
                    'source': item.get('displayLink', ''),
                    'url': url,
                    'credibility': 'high'
                })
                # BUGFIX: 'credible_sources' was initialized but never
                # populated, so _calculate_quality_score could never award
                # its up-to-2 credible-source points.
                validated_data['credible_sources'].append(url)

        # Extract and validate statistics from scraped page content.
        for content_item in search_results.get('scraped_content', []):
            stats = self.extract_statistics(content_item.get('content', ''))
            validated_data['statistics'].extend(stats)

        # Also extract statistics from the Google result snippets.
        for item in search_results.get('google_results', []):
            snippet_stats = self.extract_statistics(item.get('snippet', ''))
            validated_data['statistics'].extend(snippet_stats)

        # Calculate overall quality score (0-10) from what was gathered.
        validated_data['quality_score'] = self._calculate_quality_score(validated_data)

        return validated_data

    def extract_metrics(self, validated_data: Dict) -> List[Dict]:
        """Extract key metrics from validated data.

        Args:
            validated_data: dict as returned by validate_research_data.

        Returns:
            Up to 10 metric dicts ('metric', 'type', 'context',
            'confidence'), sorted by descending confidence.
        """
        metrics = []

        # Promote already-extracted statistics that have both value and type.
        for stat in validated_data.get('statistics', []):
            if stat.get('value') and stat.get('type'):
                metrics.append({
                    'metric': stat['value'],
                    'type': stat['type'],
                    'context': stat.get('context', ''),
                    'confidence': stat.get('confidence', 0.5)
                })

        # Scan verified content snippets for KPI-style metrics.
        for content in validated_data.get('verified_content', []):
            content_metrics = self._extract_metrics_from_text(content.get('content', ''))
            metrics.extend(content_metrics)

        # Defensive: also scan 'scraped_content' if the caller passed raw
        # search results. NOTE(review): validate_research_data never emits
        # this key, so this loop is a no-op on its output -- confirm whether
        # callers pass raw results here.
        for content in validated_data.get('scraped_content', []):
            content_metrics = self._extract_metrics_from_text(content.get('content', ''))
            metrics.extend(content_metrics)

        # Sort by confidence and return the top metrics.
        metrics.sort(key=lambda x: x.get('confidence', 0), reverse=True)
        return metrics[:10]

    def extract_statistics(self, text: str) -> List[Dict]:
        """Extract statistical data from text.

        Args:
            text: free-form text to scan.

        Returns:
            Up to 5 statistic dicts ('value', 'type', 'context',
            'confidence'), in pattern-then-position order.
        """
        statistics = []

        # Patterns for different types of statistics. Note a single phrase
        # can match several patterns (e.g. "$3 billion" also matches
        # 'large_numbers'), producing overlapping entries by design.
        patterns = {
            'percentage': r'(\d+(?:\.\d+)?)\s*%',
            'currency': r'\$(\d{1,3}(?:,\d{3})*(?:\.\d+)?)\s*(billion|million|trillion)?',
            'growth': r'(\d+(?:\.\d+)?)\s*(times|fold|x)\s*(?:increase|growth|rise)',
            'large_numbers': r'(\d{1,3}(?:,\d{3})*)\s*(billion|million|thousand)',
            'ratios': r'(\d+(?:\.\d+)?):\s*(\d+(?:\.\d+)?)',
            'years': r'(20\d{2})',
            'quantities': r'(\d+(?:,\d{3})*)\s*(units|people|companies|users|customers)'
        }

        for stat_type, pattern in patterns.items():
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                statistic = {
                    'value': match.group(0),
                    'type': stat_type,
                    'context': self._extract_context(text, match.start(), match.end()),
                    'confidence': self._calculate_stat_confidence(match.group(0), stat_type)
                }
                statistics.append(statistic)

        return statistics[:5]  # Return top 5 statistics

    def calculate_credibility_score(self, search_results: Dict) -> float:
        """Calculate overall credibility score for research results.

        Args:
            search_results: dict with optional 'sources' (URL list) and
                'scraped_content' lists.

        Returns:
            Score in [0.0, 10.0]; 0.0 when there are no sources at all.
        """
        total_sources = len(search_results.get('sources', []))
        if total_sources == 0:
            return 0.0

        credible_count = 0
        for source in search_results.get('sources', []):
            if self._is_credible_source(source):
                credible_count += 1

        # Base credibility on the fraction of credible sources (0-10).
        base_score = (credible_count / total_sources) * 10

        # Bonus for substantial scraped content: up to 2 points, scaled by
        # average content length (1 point per 1000 chars on average).
        content_items = search_results.get('scraped_content', [])
        if content_items:
            avg_content_length = sum(len(item.get('content', '')) for item in content_items) / len(content_items)
            content_bonus = min(avg_content_length / 1000, 2.0)  # Up to 2 point bonus
            base_score += content_bonus

        return min(base_score, 10.0)  # Cap at 10

    def _is_credible_source(self, url: str) -> bool:
        """Check if URL is from a credible source (substring match against
        self.credible_domains; empty/missing URL is not credible)."""
        if not url:
            return False

        url_lower = url.lower()
        return any(domain in url_lower for domain in self.credible_domains)

    def _calculate_quality_score(self, validated_data: Dict) -> float:
        """Calculate overall data quality score in [0.0, 10.0].

        Components: verified content (1.5 each, max 5), statistics
        (0.5 each, max 3), credible sources (1.0 each, max 2).
        """
        score = 0.0

        # Points for verified content
        verified_count = len(validated_data.get('verified_content', []))
        score += min(verified_count * 1.5, 5.0)  # Up to 5 points

        # Points for statistics
        stats_count = len(validated_data.get('statistics', []))
        score += min(stats_count * 0.5, 3.0)  # Up to 3 points

        # Points for credible sources
        credible_count = len(validated_data.get('credible_sources', []))
        score += min(credible_count * 1.0, 2.0)  # Up to 2 points

        return min(score, 10.0)

    def _extract_metrics_from_text(self, text: str) -> List[Dict]:
        """Extract KPI-style metrics (ROI, revenue, growth, market share,
        efficiency) from text. Returns up to 3 metric dicts."""
        metrics = []

        # Look for key performance indicators; group(1) captures the number.
        kpi_patterns = [
            r'ROI.*?(\d+(?:\.\d+)?%)',
            r'revenue.*?(\$\d+(?:,\d{3})*(?:\.\d+)?)',
            r'growth.*?(\d+(?:\.\d+)?%)',
            r'market share.*?(\d+(?:\.\d+)?%)',
            r'efficiency.*?(\d+(?:\.\d+)?%)',
        ]

        for pattern in kpi_patterns:
            matches = re.finditer(pattern, text, re.IGNORECASE)
            for match in matches:
                metric = {
                    'metric': match.group(1),
                    'type': 'kpi',
                    'context': match.group(0),
                    'confidence': 0.8
                }
                metrics.append(metric)

        return metrics[:3]  # Return top 3 metrics

    def _extract_context(self, text: str, start: int, end: int, window: int = 50) -> str:
        """Extract up to `window` characters of context on each side of a
        match at text[start:end], clamped to the text bounds and stripped."""
        context_start = max(0, start - window)
        context_end = min(len(text), end + window)

        context = text[context_start:context_end].strip()
        return context

    def _calculate_stat_confidence(self, value: str, stat_type: str) -> float:
        """Calculate confidence score for a statistic, clamped to [0.1, 1.0].

        Base 0.5; +0.3 for percentage/currency; -0.2 for "very round"
        numbers (likely estimates).
        """
        confidence = 0.5  # Base confidence

        # Higher confidence for certain types
        if stat_type in ['percentage', 'currency']:
            confidence += 0.3

        # Lower confidence for very round numbers (might be estimates).
        # BUGFIX: the previous re.match(r'\d+0+', ...) was unanchored at the
        # end, so it matched any number merely containing a zero (e.g. "105"
        # via '10'+'0'). fullmatch requires the whole number to end in zeros
        # (e.g. "100", "2000"), matching the stated intent.
        cleaned = value.replace(',', '').replace('.', '').replace('%', '')
        if re.fullmatch(r'\d+0+', cleaned):
            confidence -= 0.2

        return max(0.1, min(1.0, confidence))

    def fact_check_claim(self, claim: str, context: Dict) -> Dict:
        """Basic fact-checking for claims (placeholder for advanced
        implementation).

        Args:
            claim: the claim text to check.
            context: supporting research context (currently unused).

        Returns:
            dict flagging the claim for manual review.
        """
        return {
            'claim': claim,
            'verification_status': 'requires_manual_review',
            'confidence': 0.5,
            'supporting_sources': [],
            'contradicting_sources': []
        }