Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| # -*- coding: utf-8 -*- | |
| """ | |
| E-E-A-T Metrics Calculator for SysCRED | |
| ======================================== | |
| Calculates Google-style E-E-A-T metrics (Experience, Expertise, Authority, Trust). | |
| These metrics mirror modern Google ranking signals: | |
| - Experience: Domain age, content freshness | |
| - Expertise: Author identification, depth of content | |
| - Authority: PageRank simulation, citations/backlinks | |
| - Trust: HTTPS, fact-checks, low bias score | |
| """ | |
| from typing import Dict, Any, Optional, List | |
| from dataclasses import dataclass | |
| import re | |
| from datetime import datetime | |
| import logging | |
| logger = logging.getLogger(__name__) | |
| class EEATScore: | |
| """E-E-A-T score container.""" | |
| experience: float # 0-1 | |
| expertise: float # 0-1 | |
| authority: float # 0-1 | |
| trust: float # 0-1 | |
| def overall(self) -> float: | |
| """Weighted average of all E-E-A-T components.""" | |
| # Weights based on Google's emphasis | |
| weights = { | |
| 'experience': 0.15, | |
| 'expertise': 0.25, | |
| 'authority': 0.35, | |
| 'trust': 0.25 | |
| } | |
| return ( | |
| self.experience * weights['experience'] + | |
| self.expertise * weights['expertise'] + | |
| self.authority * weights['authority'] + | |
| self.trust * weights['trust'] | |
| ) | |
| def to_dict(self) -> Dict[str, Any]: | |
| """Convert to dictionary for JSON serialization.""" | |
| return { | |
| 'experience': round(self.experience, 3), | |
| 'expertise': round(self.expertise, 3), | |
| 'authority': round(self.authority, 3), | |
| 'trust': round(self.trust, 3), | |
| 'overall': round(self.overall, 3), | |
| 'experience_pct': f"{int(self.experience * 100)}%", | |
| 'expertise_pct': f"{int(self.expertise * 100)}%", | |
| 'authority_pct': f"{int(self.authority * 100)}%", | |
| 'trust_pct': f"{int(self.trust * 100)}%", | |
| 'overall_pct': f"{int(self.overall * 100)}%" | |
| } | |
| class EEATCalculator: | |
| """ | |
| Calculate E-E-A-T metrics from various signals. | |
| Mirrors Google's quality rater guidelines: | |
| - Experience: Has the author demonstrated real experience? | |
| - Expertise: Is the content expert-level? | |
| - Authority: Is the source recognized as authoritative? | |
| - Trust: Is the source trustworthy? | |
| """ | |
| # Known authoritative domains | |
| AUTHORITATIVE_DOMAINS = { | |
| # News | |
| 'lemonde.fr': 0.95, | |
| 'lefigaro.fr': 0.90, | |
| 'liberation.fr': 0.88, | |
| 'nytimes.com': 0.95, | |
| 'washingtonpost.com': 0.93, | |
| 'theguardian.com': 0.92, | |
| 'bbc.com': 0.94, | |
| 'bbc.co.uk': 0.94, | |
| 'reuters.com': 0.96, | |
| 'apnews.com': 0.95, | |
| # Academic | |
| 'nature.com': 0.98, | |
| 'science.org': 0.98, | |
| 'pubmed.ncbi.nlm.nih.gov': 0.97, | |
| 'scholar.google.com': 0.85, | |
| # Government | |
| 'gouv.fr': 0.90, | |
| 'gov.uk': 0.90, | |
| 'whitehouse.gov': 0.88, | |
| 'europa.eu': 0.92, | |
| # Fact-checkers | |
| 'snopes.com': 0.88, | |
| 'factcheck.org': 0.90, | |
| 'politifact.com': 0.88, | |
| 'fullfact.org': 0.89, | |
| # Wikipedia (moderate authority) | |
| 'wikipedia.org': 0.75, | |
| 'fr.wikipedia.org': 0.75, | |
| 'en.wikipedia.org': 0.75, | |
| } | |
| # Low-trust domains (misinformation sources) | |
| LOW_TRUST_DOMAINS = { | |
| 'infowars.com': 0.1, | |
| 'breitbart.com': 0.3, | |
| 'naturalnews.com': 0.15, | |
| # Add more as needed | |
| } | |
| def __init__(self): | |
| """Initialize E-E-A-T calculator.""" | |
| pass | |
| def calculate( | |
| self, | |
| url: str, | |
| text: str, | |
| nlp_analysis: Optional[Dict[str, Any]] = None, | |
| pagerank: Optional[float] = None, | |
| fact_checks: Optional[List[Dict]] = None, | |
| domain_age_years: Optional[float] = None, | |
| has_https: bool = True, | |
| author_identified: bool = False, | |
| seo_score: Optional[float] = None | |
| ) -> EEATScore: | |
| """ | |
| Calculate E-E-A-T scores from available signals. | |
| Args: | |
| url: Source URL | |
| text: Article text content | |
| nlp_analysis: NLP analysis results (sentiment, coherence, bias) | |
| pagerank: Simulated PageRank score (0-1) | |
| fact_checks: List of fact-check results | |
| domain_age_years: Domain age in years (from WHOIS) | |
| has_https: Whether site uses HTTPS | |
| author_identified: Whether author is clearly identified | |
| seo_score: SEO/technical quality score | |
| Returns: | |
| EEATScore with all component scores | |
| """ | |
| # Extract domain from URL | |
| domain = self._extract_domain(url) | |
| # Calculate each component | |
| experience = self._calculate_experience( | |
| domain_age_years, | |
| text, | |
| nlp_analysis | |
| ) | |
| expertise = self._calculate_expertise( | |
| text, | |
| author_identified, | |
| nlp_analysis | |
| ) | |
| authority = self._calculate_authority( | |
| domain, | |
| pagerank, | |
| seo_score | |
| ) | |
| trust = self._calculate_trust( | |
| domain, | |
| has_https, | |
| fact_checks, | |
| nlp_analysis | |
| ) | |
| return EEATScore( | |
| experience=experience, | |
| expertise=expertise, | |
| authority=authority, | |
| trust=trust | |
| ) | |
| def _extract_domain(self, url: str) -> str: | |
| """Extract domain from URL.""" | |
| import re | |
| match = re.search(r'https?://(?:www\.)?([^/]+)', url) | |
| return match.group(1).lower() if match else url.lower() | |
| def _calculate_experience( | |
| self, | |
| domain_age_years: Optional[float], | |
| text: str, | |
| nlp_analysis: Optional[Dict] | |
| ) -> float: | |
| """ | |
| Calculate Experience score. | |
| Factors: | |
| - Domain age (longer = more experience) | |
| - Content freshness (recently updated) | |
| - First-hand experience indicators in text | |
| """ | |
| score = 0.5 # Base score | |
| # Domain age contribution (max 0.3) | |
| if domain_age_years is not None: | |
| age_score = min(domain_age_years / 20, 1.0) * 0.3 # 20 years = max | |
| score += age_score | |
| else: | |
| score += 0.15 # Assume moderate age | |
| # Content depth contribution (max 0.2) | |
| word_count = len(text.split()) if text else 0 | |
| if word_count > 1000: | |
| score += 0.2 | |
| elif word_count > 500: | |
| score += 0.15 | |
| elif word_count > 200: | |
| score += 0.1 | |
| # First-hand experience indicators (max 0.1) | |
| experience_indicators = [ | |
| r'\b(j\'ai|je suis|nous avons|I have|we have|in my experience)\b', | |
| r'\b(interview|entretien|témoignage|witness|firsthand)\b', | |
| r'\b(sur place|on the ground|eyewitness)\b' | |
| ] | |
| for pattern in experience_indicators: | |
| if re.search(pattern, text, re.IGNORECASE): | |
| score += 0.03 | |
| return min(score, 1.0) | |
| def _calculate_expertise( | |
| self, | |
| text: str, | |
| author_identified: bool, | |
| nlp_analysis: Optional[Dict] | |
| ) -> float: | |
| """ | |
| Calculate Expertise score. | |
| Factors: | |
| - Author identification | |
| - Technical depth of content | |
| - Citation of sources | |
| - Coherence (from NLP) | |
| """ | |
| score = 0.4 # Base score | |
| # Author identification (0.2) | |
| if author_identified: | |
| score += 0.2 | |
| # Citation indicators (max 0.2) | |
| citation_patterns = [ | |
| r'\b(selon|according to|d\'après|source:)\b', | |
| r'\b(étude|study|research|rapport|report)\b', | |
| r'\b(expert|spécialiste|chercheur|professor|Dr\.)\b', | |
| r'\[([\d]+)\]', # [1] style citations | |
| r'https?://[^\s]+' # Links | |
| ] | |
| citation_count = 0 | |
| for pattern in citation_patterns: | |
| citation_count += len(re.findall(pattern, text, re.IGNORECASE)) | |
| score += min(citation_count * 0.02, 0.2) | |
| # Coherence from NLP analysis (0.2) | |
| if nlp_analysis and 'coherence' in nlp_analysis: | |
| coherence = nlp_analysis['coherence'] | |
| if isinstance(coherence, dict): | |
| coherence = coherence.get('score', 0.5) | |
| score += coherence * 0.2 | |
| else: | |
| score += 0.1 # Assume moderate coherence | |
| return min(score, 1.0) | |
| def _calculate_authority( | |
| self, | |
| domain: str, | |
| pagerank: Optional[float], | |
| seo_score: Optional[float] | |
| ) -> float: | |
| """ | |
| Calculate Authority score. | |
| Factors: | |
| - Known authoritative domain | |
| - PageRank simulation | |
| - SEO/technical quality | |
| """ | |
| score = 0.3 # Base score | |
| # Known domain authority (max 0.5) | |
| for known_domain, authority in self.AUTHORITATIVE_DOMAINS.items(): | |
| if known_domain in domain: | |
| score = max(score, authority * 0.5 + 0.3) | |
| break | |
| # Check low-trust domains | |
| for low_trust_domain, low_score in self.LOW_TRUST_DOMAINS.items(): | |
| if low_trust_domain in domain: | |
| score = min(score, low_score) | |
| break | |
| # PageRank contribution (max 0.3) | |
| if pagerank is not None: | |
| score += pagerank * 0.3 | |
| else: | |
| score += 0.15 # Assume moderate pagerank | |
| # SEO score contribution (max 0.2) | |
| if seo_score is not None: | |
| score += seo_score * 0.2 | |
| else: | |
| score += 0.1 | |
| return min(score, 1.0) | |
| def _calculate_trust( | |
| self, | |
| domain: str, | |
| has_https: bool, | |
| fact_checks: Optional[List[Dict]], | |
| nlp_analysis: Optional[Dict] | |
| ) -> float: | |
| """ | |
| Calculate Trust score. | |
| Factors: | |
| - HTTPS | |
| - Fact-check results | |
| - Bias score (low = better) | |
| - Known trustworthy domain | |
| """ | |
| score = 0.4 # Base score | |
| # HTTPS (0.1) | |
| if has_https: | |
| score += 0.1 | |
| # Fact-check results (max 0.3) | |
| if fact_checks: | |
| positive_checks = sum(1 for fc in fact_checks | |
| if fc.get('rating', '').lower() in ['true', 'vrai', 'correct']) | |
| negative_checks = sum(1 for fc in fact_checks | |
| if fc.get('rating', '').lower() in ['false', 'faux', 'incorrect', 'pants-fire']) | |
| if positive_checks > 0: | |
| score += 0.2 | |
| if negative_checks > 0: | |
| score -= 0.3 | |
| # Bias score (max 0.2, lower bias = higher trust) | |
| if nlp_analysis: | |
| bias_data = nlp_analysis.get('bias_analysis', {}) | |
| if isinstance(bias_data, dict): | |
| bias_score = bias_data.get('score', 0.3) | |
| else: | |
| bias_score = 0.3 | |
| # Invert: low bias = high trust contribution | |
| score += (1 - bias_score) * 0.2 | |
| else: | |
| score += 0.1 | |
| # Known trustworthy domain (0.1) | |
| for known_domain in self.AUTHORITATIVE_DOMAINS: | |
| if known_domain in domain: | |
| score += 0.1 | |
| break | |
| # Known low-trust domain (penalty) | |
| for low_trust_domain in self.LOW_TRUST_DOMAINS: | |
| if low_trust_domain in domain: | |
| score -= 0.3 | |
| break | |
| return max(min(score, 1.0), 0.0) | |
| def explain_score(self, eeat: EEATScore, url: str) -> str: | |
| """ | |
| Generate human-readable explanation of E-E-A-T score. | |
| Args: | |
| eeat: EEATScore instance | |
| url: Source URL | |
| Returns: | |
| Formatted explanation string | |
| """ | |
| domain = self._extract_domain(url) | |
| explanations = [] | |
| # Experience | |
| if eeat.experience >= 0.8: | |
| explanations.append(f"✅ **Expérience élevée** ({eeat.experience_pct}): Source établie depuis longtemps") | |
| elif eeat.experience >= 0.5: | |
| explanations.append(f"🔶 **Expérience moyenne** ({eeat.experience_pct}): Source modérément établie") | |
| else: | |
| explanations.append(f"⚠️ **Expérience faible** ({eeat.experience_pct}): Source récente ou peu connue") | |
| # Expertise | |
| if eeat.expertise >= 0.8: | |
| explanations.append(f"✅ **Expertise élevée** ({eeat.expertise_pct}): Contenu approfondi avec citations") | |
| elif eeat.expertise >= 0.5: | |
| explanations.append(f"🔶 **Expertise moyenne** ({eeat.expertise_pct}): Contenu standard") | |
| else: | |
| explanations.append(f"⚠️ **Expertise faible** ({eeat.expertise_pct}): Manque de profondeur") | |
| # Authority | |
| if eeat.authority >= 0.8: | |
| explanations.append(f"✅ **Autorité élevée** ({eeat.authority_pct}): Source très citée et reconnue") | |
| elif eeat.authority >= 0.5: | |
| explanations.append(f"🔶 **Autorité moyenne** ({eeat.authority_pct}): Source modérément reconnue") | |
| else: | |
| explanations.append(f"⚠️ **Autorité faible** ({eeat.authority_pct}): Peu de citations externes") | |
| # Trust | |
| if eeat.trust >= 0.8: | |
| explanations.append(f"✅ **Confiance élevée** ({eeat.trust_pct}): Faits vérifiés, pas de biais") | |
| elif eeat.trust >= 0.5: | |
| explanations.append(f"🔶 **Confiance moyenne** ({eeat.trust_pct}): Quelques signaux de confiance") | |
| else: | |
| explanations.append(f"⚠️ **Confiance faible** ({eeat.trust_pct}): Prudence recommandée") | |
| return "\n".join(explanations) | |
| # Test | |
| if __name__ == "__main__": | |
| calc = EEATCalculator() | |
| test_url = "https://www.lemonde.fr/politique/article/2024/01/06/trump.html" | |
| test_text = """ | |
| Selon une étude du chercheur Dr. Martin, l'insurrection du 6 janvier 2021 | |
| au Capitol a été un événement marquant. Notre reporter sur place a témoigné | |
| des événements. Les experts politiques analysent les conséquences. | |
| """ | |
| nlp_analysis = { | |
| 'coherence': {'score': 0.8}, | |
| 'bias_analysis': {'score': 0.2} | |
| } | |
| eeat = calc.calculate( | |
| url=test_url, | |
| text=test_text, | |
| nlp_analysis=nlp_analysis, | |
| pagerank=0.7, | |
| has_https=True, | |
| author_identified=True | |
| ) | |
| print("=== E-E-A-T Scores ===") | |
| print(f"Experience: {eeat.experience_pct}") | |
| print(f"Expertise: {eeat.expertise_pct}") | |
| print(f"Authority: {eeat.authority_pct}") | |
| print(f"Trust: {eeat.trust_pct}") | |
| print(f"Overall: {eeat.overall_pct}") | |
| print("\n=== Explanation ===") | |
| print(calc.explain_score(eeat, test_url)) | |