# NewsLetter/utils/data_validator.py
# (origin: Hugging Face upload by SmartHeal, commit a19173c)
import logging
import re
from datetime import datetime
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse
class DataValidator:
    """Data validation and fact-checking utilities.

    Validates research/search results against a whitelist of credible
    domains, extracts statistics and metrics from free text via regex,
    and computes 0-10 quality/credibility scores.
    """

    def __init__(self):
        # Bare entries ('edu', 'gov', 'org') act as TLD suffixes; full
        # entries ('reuters.com', ...) match the host or any subdomain.
        self.credible_domains = [
            'edu', 'gov', 'org', 'reuters.com', 'bloomberg.com',
            'wsj.com', 'ft.com', 'nature.com', 'science.org',
            'who.int', 'cdc.gov', 'fda.gov', 'sec.gov'
        ]

    def validate_research_data(self, search_results: Dict) -> Dict:
        """Validate and clean research data.

        Args:
            search_results: Dict with optional keys 'google_results'
                (list of Google CSE-style items with 'link', 'title',
                'snippet', 'displayLink') and 'scraped_content'
                (list of dicts carrying a 'content' string).

        Returns:
            Dict with 'verified_content' (credible items), 'statistics'
            (regex-extracted stats), 'credible_sources' (URLs), and a
            0-10 'quality_score'.
        """
        validated_data = {
            'verified_content': [],
            'statistics': [],
            'credible_sources': [],
            'quality_score': 0
        }
        # Keep only Google results that come from a credible domain.
        for item in search_results.get('google_results', []):
            url = item.get('link', '')
            if self._is_credible_source(url):
                validated_data['verified_content'].append({
                    'title': item.get('title', ''),
                    'content': item.get('snippet', ''),
                    'source': item.get('displayLink', ''),
                    'url': url,
                    'credibility': 'high'
                })
                # BUGFIX: record the credible URL; previously this list
                # was never populated, so the credible-source component
                # of the quality score was always zero.
                validated_data['credible_sources'].append(url)
        # Extract statistics from scraped page bodies...
        for content_item in search_results.get('scraped_content', []):
            stats = self.extract_statistics(content_item.get('content', ''))
            validated_data['statistics'].extend(stats)
        # ...and from Google result snippets as well.
        for item in search_results.get('google_results', []):
            snippet_stats = self.extract_statistics(item.get('snippet', ''))
            validated_data['statistics'].extend(snippet_stats)
        validated_data['quality_score'] = self._calculate_quality_score(validated_data)
        return validated_data

    def extract_metrics(self, validated_data: Dict) -> List[Dict]:
        """Extract up to 10 key metrics from validated data.

        Pulls typed statistics straight through, mines KPI patterns out
        of verified/scraped content, and returns the metrics sorted by
        descending confidence.
        """
        metrics = []
        # Pass through pre-extracted statistics that carry a value+type.
        for stat in validated_data.get('statistics', []):
            if stat.get('value') and stat.get('type'):
                metrics.append({
                    'metric': stat['value'],
                    'type': stat['type'],
                    'context': stat.get('context', ''),
                    'confidence': stat.get('confidence', 0.5)
                })
        # Mine KPI-style metrics from verified content snippets.
        for content in validated_data.get('verified_content', []):
            content_metrics = self._extract_metrics_from_text(content.get('content', ''))
            metrics.extend(content_metrics)
        # Best-effort: 'scraped_content' is not produced by
        # validate_research_data, but a caller-assembled dict may carry
        # it — TODO confirm against callers.
        for content in validated_data.get('scraped_content', []):
            content_metrics = self._extract_metrics_from_text(content.get('content', ''))
            metrics.extend(content_metrics)
        # Highest-confidence metrics first; cap at 10.
        metrics.sort(key=lambda x: x.get('confidence', 0), reverse=True)
        return metrics[:10]

    def extract_statistics(self, text: str) -> List[Dict]:
        """Extract statistical data from text.

        Runs a battery of regexes (percentages, currency, growth
        multiples, large numbers, ratios, years, quantities) and returns
        the top 5 matches by confidence, each with surrounding context.
        """
        statistics = []
        patterns = {
            'percentage': r'(\d+(?:\.\d+)?)\s*%',
            'currency': r'\$(\d{1,3}(?:,\d{3})*(?:\.\d+)?)\s*(billion|million|trillion)?',
            'growth': r'(\d+(?:\.\d+)?)\s*(times|fold|x)\s*(?:increase|growth|rise)',
            'large_numbers': r'(\d{1,3}(?:,\d{3})*)\s*(billion|million|thousand)',
            'ratios': r'(\d+(?:\.\d+)?):\s*(\d+(?:\.\d+)?)',
            'years': r'(20\d{2})',
            'quantities': r'(\d+(?:,\d{3})*)\s*(units|people|companies|users|customers)'
        }
        for stat_type, pattern in patterns.items():
            for match in re.finditer(pattern, text, re.IGNORECASE):
                statistics.append({
                    'value': match.group(0),
                    'type': stat_type,
                    'context': self._extract_context(text, match.start(), match.end()),
                    'confidence': self._calculate_stat_confidence(match.group(0), stat_type)
                })
        # BUGFIX: sort before truncating so we actually return the *top*
        # 5 (by confidence), not merely the first 5 found.
        statistics.sort(key=lambda s: s['confidence'], reverse=True)
        return statistics[:5]

    def calculate_credibility_score(self, search_results: Dict) -> float:
        """Calculate a 0-10 credibility score for research results.

        Base score is the credible fraction of 'sources' scaled to 10,
        plus up to a 2-point bonus for longer scraped content.
        """
        sources = search_results.get('sources', [])
        if not sources:
            return 0.0
        credible_count = sum(1 for source in sources if self._is_credible_source(source))
        base_score = (credible_count / len(sources)) * 10
        content_items = search_results.get('scraped_content', [])
        if content_items:
            avg_content_length = sum(len(item.get('content', '')) for item in content_items) / len(content_items)
            base_score += min(avg_content_length / 1000, 2.0)  # up to 2-point bonus
        return min(base_score, 10.0)  # cap at 10

    def _is_credible_source(self, url: str) -> bool:
        """Check whether *url*'s hostname belongs to a credible domain.

        BUGFIX: the original used plain substring matching, so e.g.
        'https://education-news.com' matched 'edu'. We now parse the
        hostname and require an exact or dot-suffix domain match.
        """
        if not url:
            return False
        # urlparse only fills netloc when a scheme or '//' is present;
        # bare hosts like 'www.cdc.gov' (displayLink values) need one.
        candidate = url if '//' in url else '//' + url
        host = urlparse(candidate).netloc.lower().split(':')[0]
        if not host:
            return False
        return any(host == domain or host.endswith('.' + domain)
                   for domain in self.credible_domains)

    def _calculate_quality_score(self, validated_data: Dict) -> float:
        """Calculate a 0-10 data quality score from validated data."""
        score = 0.0
        # Verified content: 1.5 points each, capped at 5.
        verified_count = len(validated_data.get('verified_content', []))
        score += min(verified_count * 1.5, 5.0)
        # Statistics: 0.5 points each, capped at 3.
        stats_count = len(validated_data.get('statistics', []))
        score += min(stats_count * 0.5, 3.0)
        # Credible sources: 1 point each, capped at 2.
        credible_count = len(validated_data.get('credible_sources', []))
        score += min(credible_count * 1.0, 2.0)
        return min(score, 10.0)

    def _extract_metrics_from_text(self, text: str) -> List[Dict]:
        """Extract up to 3 KPI-style metrics (ROI, revenue, growth, ...)."""
        metrics = []
        kpi_patterns = [
            r'ROI.*?(\d+(?:\.\d+)?%)',
            r'revenue.*?(\$\d+(?:,\d{3})*(?:\.\d+)?)',
            r'growth.*?(\d+(?:\.\d+)?%)',
            r'market share.*?(\d+(?:\.\d+)?%)',
            r'efficiency.*?(\d+(?:\.\d+)?%)',
        ]
        for pattern in kpi_patterns:
            for match in re.finditer(pattern, text, re.IGNORECASE):
                metrics.append({
                    'metric': match.group(1),
                    'type': 'kpi',
                    'context': match.group(0),
                    'confidence': 0.8
                })
        return metrics[:3]  # return top 3 metrics

    def _extract_context(self, text: str, start: int, end: int, window: int = 50) -> str:
        """Return up to *window* chars of text on each side of a match."""
        context_start = max(0, start - window)
        context_end = min(len(text), end + window)
        return text[context_start:context_end].strip()

    def _calculate_stat_confidence(self, value: str, stat_type: str) -> float:
        """Calculate a confidence score in [0.1, 1.0] for a statistic."""
        confidence = 0.5  # base confidence
        # Percentages and currency amounts are usually more reliable.
        if stat_type in ['percentage', 'currency']:
            confidence += 0.3
        # BUGFIX: the original re.match(r'\d+0+', ...) matched any number
        # merely containing a zero (e.g. '105') via backtracking. Only
        # penalize genuinely round numbers (multi-digit, trailing zero),
        # which are more likely to be estimates.
        digits = re.sub(r'\D', '', value)
        if len(digits) > 1 and digits.endswith('0'):
            confidence -= 0.2
        return max(0.1, min(1.0, confidence))

    def fact_check_claim(self, claim: str, context: Dict) -> Dict:
        """Basic fact-checking for claims (placeholder implementation).

        Always defers to manual review; *context* is currently unused.
        """
        return {
            'claim': claim,
            'verification_status': 'requires_manual_review',
            'confidence': 0.5,
            'supporting_sources': [],
            'contradicting_sources': []
        }