"""Data validation and fact-checking utilities for web-research results."""
import logging
import re
from datetime import datetime
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse
class DataValidator:
    """Data validation and fact-checking utilities.

    Consumes the result dict produced by the search layer (keys such as
    'google_results', 'scraped_content', 'sources' — assumed shape, confirm
    against the caller) and produces cleaned content, extracted statistics,
    and 0–10 quality/credibility scores.
    """

    # Statistic-extraction patterns, compiled once at class-definition time
    # instead of being rebuilt on every extract_statistics() call.
    _STAT_PATTERNS = {
        'percentage': re.compile(r'(\d+(?:\.\d+)?)\s*%'),
        'currency': re.compile(
            r'\$(\d{1,3}(?:,\d{3})*(?:\.\d+)?)\s*(billion|million|trillion)?',
            re.IGNORECASE),
        'growth': re.compile(
            r'(\d+(?:\.\d+)?)\s*(times|fold|x)\s*(?:increase|growth|rise)',
            re.IGNORECASE),
        'large_numbers': re.compile(
            r'(\d{1,3}(?:,\d{3})*)\s*(billion|million|thousand)',
            re.IGNORECASE),
        'ratios': re.compile(r'(\d+(?:\.\d+)?):\s*(\d+(?:\.\d+)?)'),
        'years': re.compile(r'(20\d{2})'),
        'quantities': re.compile(
            r'(\d+(?:,\d{3})*)\s*(units|people|companies|users|customers)',
            re.IGNORECASE),
    }

    # KPI patterns for _extract_metrics_from_text, also compiled once.
    _KPI_PATTERNS = [
        re.compile(r'ROI.*?(\d+(?:\.\d+)?%)', re.IGNORECASE),
        re.compile(r'revenue.*?(\$\d+(?:,\d{3})*(?:\.\d+)?)', re.IGNORECASE),
        re.compile(r'growth.*?(\d+(?:\.\d+)?%)', re.IGNORECASE),
        re.compile(r'market share.*?(\d+(?:\.\d+)?%)', re.IGNORECASE),
        re.compile(r'efficiency.*?(\d+(?:\.\d+)?%)', re.IGNORECASE),
    ]

    def __init__(self):
        # Mixed list of bare TLDs ('edu', 'gov', 'org') and full domains;
        # matched against the URL's hostname suffix in _is_credible_source().
        self.credible_domains = [
            'edu', 'gov', 'org', 'reuters.com', 'bloomberg.com',
            'wsj.com', 'ft.com', 'nature.com', 'science.org',
            'who.int', 'cdc.gov', 'fda.gov', 'sec.gov'
        ]

    def validate_research_data(self, search_results: Dict) -> Dict:
        """Validate and clean research data.

        Returns a dict with 'verified_content' (credible Google results),
        'statistics' (extracted from scraped pages and snippets),
        'credible_sources' (the URLs of verified items), and a 0–10
        'quality_score'.
        """
        validated_data = {
            'verified_content': [],
            'statistics': [],
            'credible_sources': [],
            'quality_score': 0
        }

        # Keep only Google results that come from a credible source.
        for item in search_results.get('google_results', []):
            url = item.get('link', '')
            if self._is_credible_source(url):
                validated_data['verified_content'].append({
                    'title': item.get('title', ''),
                    'content': item.get('snippet', ''),
                    'source': item.get('displayLink', ''),
                    'url': url,
                    'credibility': 'high'
                })
                # BUGFIX: credible_sources was never populated before, so
                # _calculate_quality_score silently lost its up-to-2-point
                # credible-source bonus.
                validated_data['credible_sources'].append(url)

        # Extract statistics from full scraped pages ...
        for content_item in search_results.get('scraped_content', []):
            stats = self.extract_statistics(content_item.get('content', ''))
            validated_data['statistics'].extend(stats)

        # ... and from the (shorter) Google result snippets.
        for item in search_results.get('google_results', []):
            snippet_stats = self.extract_statistics(item.get('snippet', ''))
            validated_data['statistics'].extend(snippet_stats)

        validated_data['quality_score'] = self._calculate_quality_score(validated_data)
        return validated_data

    def extract_metrics(self, validated_data: Dict) -> List[Dict]:
        """Extract key metrics from validated data.

        Pulls metrics from the 'statistics' entries plus KPI phrases found in
        'verified_content' and (if the caller supplies it) 'scraped_content',
        then returns the 10 highest-confidence metrics.
        """
        metrics = []

        # Promote well-formed statistics to metrics.
        for stat in validated_data.get('statistics', []):
            if stat.get('value') and stat.get('type'):
                metrics.append({
                    'metric': stat['value'],
                    'type': stat['type'],
                    'context': stat.get('context', ''),
                    'confidence': stat.get('confidence', 0.5)
                })

        # Scan verified snippets for KPI phrases.
        for content in validated_data.get('verified_content', []):
            metrics.extend(self._extract_metrics_from_text(content.get('content', '')))

        # NOTE: validate_research_data() does not emit 'scraped_content';
        # kept for callers that pass an augmented dict.
        for content in validated_data.get('scraped_content', []):
            metrics.extend(self._extract_metrics_from_text(content.get('content', '')))

        # Highest-confidence metrics first; cap at 10.
        metrics.sort(key=lambda x: x.get('confidence', 0), reverse=True)
        return metrics[:10]

    def extract_statistics(self, text: str) -> List[Dict]:
        """Extract statistical data (percentages, currency, growth figures,
        large numbers, ratios, years, quantities) from *text*.

        Returns at most the first 5 matches, each with the matched value,
        its type, ~50 chars of surrounding context, and a confidence score.
        """
        statistics = []
        for stat_type, pattern in self._STAT_PATTERNS.items():
            for match in pattern.finditer(text):
                statistics.append({
                    'value': match.group(0),
                    'type': stat_type,
                    'context': self._extract_context(text, match.start(), match.end()),
                    'confidence': self._calculate_stat_confidence(match.group(0), stat_type)
                })
        return statistics[:5]  # Return top 5 statistics

    def calculate_credibility_score(self, search_results: Dict) -> float:
        """Calculate an overall 0–10 credibility score for research results.

        Base score is the credible fraction of 'sources' scaled to 10, plus
        up to a 2-point bonus for longer scraped content.
        """
        sources = search_results.get('sources', [])
        if not sources:
            return 0.0

        credible_count = sum(1 for source in sources if self._is_credible_source(source))

        # Base credibility on source quality.
        base_score = (credible_count / len(sources)) * 10

        # Bonus for substantive scraped content (1 point per ~1000 chars
        # of average length, capped at 2).
        content_items = search_results.get('scraped_content', [])
        if content_items:
            avg_content_length = sum(
                len(item.get('content', '')) for item in content_items
            ) / len(content_items)
            base_score += min(avg_content_length / 1000, 2.0)

        return min(base_score, 10.0)  # Cap at 10

    def _is_credible_source(self, url: str) -> bool:
        """Check whether *url*'s hostname belongs to a credible domain.

        BUGFIX: matches on the hostname suffix instead of a raw substring
        test — the old ``'org' in url`` check accepted any URL merely
        containing the letters "org" (e.g. ``scamorg.xyz``).
        """
        if not url:
            return False
        # Accept both full URLs and bare host names ("reuters.com").
        parsed = urlparse(url if '//' in url else '//' + url)
        # Drop userinfo and port from the netloc to get the bare host.
        host = parsed.netloc.lower().rsplit('@', 1)[-1].split(':')[0]
        if not host:
            return False
        return any(host == domain or host.endswith('.' + domain)
                   for domain in self.credible_domains)

    def _calculate_quality_score(self, validated_data: Dict) -> float:
        """Calculate an overall 0–10 data quality score.

        Up to 5 points for verified content (1.5 each), 3 for statistics
        (0.5 each), and 2 for credible sources (1.0 each).
        """
        score = 0.0
        score += min(len(validated_data.get('verified_content', [])) * 1.5, 5.0)
        score += min(len(validated_data.get('statistics', [])) * 0.5, 3.0)
        score += min(len(validated_data.get('credible_sources', [])) * 1.0, 2.0)
        return min(score, 10.0)

    def _extract_metrics_from_text(self, text: str) -> List[Dict]:
        """Extract up to 3 KPI-style metrics (ROI, revenue, growth,
        market share, efficiency) from *text*."""
        metrics = []
        for pattern in self._KPI_PATTERNS:
            for match in pattern.finditer(text):
                metrics.append({
                    'metric': match.group(1),
                    'type': 'kpi',
                    'context': match.group(0),
                    'confidence': 0.8
                })
        return metrics[:3]  # Return top 3 metrics

    def _extract_context(self, text: str, start: int, end: int, window: int = 50) -> str:
        """Return up to *window* chars of text on each side of a match."""
        context_start = max(0, start - window)
        context_end = min(len(text), end + window)
        return text[context_start:context_end].strip()

    def _calculate_stat_confidence(self, value: str, stat_type: str) -> float:
        """Calculate a confidence score in [0.1, 1.0] for a statistic."""
        confidence = 0.5  # Base confidence

        # Percentages and currency amounts are usually reported precisely.
        if stat_type in ('percentage', 'currency'):
            confidence += 0.3

        # Round numbers (trailing zero) are often estimates.
        # BUGFIX: the old unanchored re.match(r'\d+0+', ...) matched any
        # value containing a zero after a digit (e.g. "105"), and never
        # stripped '$', so currency values were never penalized.
        digits = re.sub(r'\D', '', value)
        if digits and digits.endswith('0'):
            confidence -= 0.2

        return max(0.1, min(1.0, confidence))

    def fact_check_claim(self, claim: str, context: Dict) -> Dict:
        """Basic fact-checking for claims (placeholder for advanced implementation).

        Always defers to manual review; supporting/contradicting source
        lists are returned empty.
        """
        return {
            'claim': claim,
            'verification_status': 'requires_manual_review',
            'confidence': 0.5,
            'supporting_sources': [],
            'contradicting_sources': []
        }