#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
E-E-A-T Metrics Calculator for SysCRED
========================================
Calculates Google-style E-E-A-T metrics (Experience, Expertise, Authority, Trust).

These heuristics approximate signals described in Google's quality rater guidelines:
- Experience: Domain age, content freshness
- Expertise: Author identification, depth of content
- Authority: PageRank simulation, citations/backlinks
- Trust: HTTPS, fact-checks, low bias score
"""

from typing import Dict, Any, Optional, List
from dataclasses import dataclass
import re
from datetime import datetime
import logging

logger = logging.getLogger(__name__)


@dataclass
class EEATScore:
    """E-E-A-T score container."""
    experience: float  # 0-1
    expertise: float   # 0-1
    authority: float   # 0-1
    trust: float       # 0-1
    
    @property
    def overall(self) -> float:
        """Weighted average of all E-E-A-T components."""
        # Heuristic weights (authority and trust emphasized); Google publishes no numeric weights.
        weights = {
            'experience': 0.15,
            'expertise': 0.25,
            'authority': 0.35,
            'trust': 0.25
        }
        return (
            self.experience * weights['experience'] +
            self.expertise * weights['expertise'] +
            self.authority * weights['authority'] +
            self.trust * weights['trust']
        )
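    # Worked example for `overall` (illustrative values, not from the demo below):
    # experience=0.6, expertise=0.7, authority=0.8, trust=0.75
    # -> 0.6*0.15 + 0.7*0.25 + 0.8*0.35 + 0.75*0.25 = 0.7325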
    
    def _pct(self, value: float) -> str:
        """Format a 0-1 score as a percentage string, e.g. 0.75 -> '75%'."""
        return f"{int(value * 100)}%"
    
    # Percentage accessors used by EEATCalculator.explain_score() and callers.
    experience_pct = property(lambda self: self._pct(self.experience))
    expertise_pct = property(lambda self: self._pct(self.expertise))
    authority_pct = property(lambda self: self._pct(self.authority))
    trust_pct = property(lambda self: self._pct(self.trust))
    overall_pct = property(lambda self: self._pct(self.overall))
    
    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        return {
            'experience': round(self.experience, 3),
            'expertise': round(self.expertise, 3),
            'authority': round(self.authority, 3),
            'trust': round(self.trust, 3),
            'overall': round(self.overall, 3),
            'experience_pct': self.experience_pct,
            'expertise_pct': self.expertise_pct,
            'authority_pct': self.authority_pct,
            'trust_pct': self.trust_pct,
            'overall_pct': self.overall_pct
        }


class EEATCalculator:
    """
    Calculate E-E-A-T metrics from various signals.
    
    Mirrors Google's quality rater guidelines:
    - Experience: Has the author demonstrated real experience?
    - Expertise: Is the content expert-level?
    - Authority: Is the source recognized as authoritative?
    - Trust: Is the source trustworthy?
    """
    
    # Known authoritative domains
    AUTHORITATIVE_DOMAINS = {
        # News
        'lemonde.fr': 0.95,
        'lefigaro.fr': 0.90,
        'liberation.fr': 0.88,
        'nytimes.com': 0.95,
        'washingtonpost.com': 0.93,
        'theguardian.com': 0.92,
        'bbc.com': 0.94,
        'bbc.co.uk': 0.94,
        'reuters.com': 0.96,
        'apnews.com': 0.95,
        # Academic
        'nature.com': 0.98,
        'science.org': 0.98,
        'pubmed.ncbi.nlm.nih.gov': 0.97,
        'scholar.google.com': 0.85,
        # Government
        'gouv.fr': 0.90,
        'gov.uk': 0.90,
        'whitehouse.gov': 0.88,
        'europa.eu': 0.92,
        # Fact-checkers
        'snopes.com': 0.88,
        'factcheck.org': 0.90,
        'politifact.com': 0.88,
        'fullfact.org': 0.89,
        # Wikipedia (moderate authority)
        'wikipedia.org': 0.75,
        'fr.wikipedia.org': 0.75,
        'en.wikipedia.org': 0.75,
    }
    
    # Low-trust domains (misinformation sources)
    LOW_TRUST_DOMAINS = {
        'infowars.com': 0.1,
        'breitbart.com': 0.3,
        'naturalnews.com': 0.15,
        # Add more as needed
    }
    
    def __init__(self):
        """Initialize E-E-A-T calculator."""
        pass
    
    def calculate(
        self,
        url: str,
        text: str,
        nlp_analysis: Optional[Dict[str, Any]] = None,
        pagerank: Optional[float] = None,
        fact_checks: Optional[List[Dict]] = None,
        domain_age_years: Optional[float] = None,
        has_https: bool = True,
        author_identified: bool = False,
        seo_score: Optional[float] = None
    ) -> EEATScore:
        """
        Calculate E-E-A-T scores from available signals.
        
        Args:
            url: Source URL
            text: Article text content
            nlp_analysis: NLP analysis results (sentiment, coherence, bias)
            pagerank: Simulated PageRank score (0-1)
            fact_checks: List of fact-check results
            domain_age_years: Domain age in years (from WHOIS)
            has_https: Whether site uses HTTPS
            author_identified: Whether author is clearly identified
            seo_score: SEO/technical quality score
            
        Returns:
            EEATScore with all component scores
        """
        # Extract domain from URL
        domain = self._extract_domain(url)
        
        # Calculate each component
        experience = self._calculate_experience(
            domain_age_years, 
            text, 
            nlp_analysis
        )
        
        expertise = self._calculate_expertise(
            text, 
            author_identified, 
            nlp_analysis
        )
        
        authority = self._calculate_authority(
            domain, 
            pagerank, 
            seo_score
        )
        
        trust = self._calculate_trust(
            domain, 
            has_https, 
            fact_checks, 
            nlp_analysis
        )
        
        return EEATScore(
            experience=experience,
            expertise=expertise,
            authority=authority,
            trust=trust
        )
    
    def _extract_domain(self, url: str) -> str:
        """Extract domain from URL."""
        match = re.search(r'https?://(?:www\.)?([^/]+)', url)
        return match.group(1).lower() if match else url.lower()
    
    def _calculate_experience(
        self,
        domain_age_years: Optional[float],
        text: str,
        nlp_analysis: Optional[Dict]
    ) -> float:
        """
        Calculate Experience score.
        
        Factors:
        - Domain age (longer = more experience)
        - Content freshness (recently updated)
        - First-hand experience indicators in text
        """
        score = 0.5  # Base score
        
        # Domain age contribution (max 0.3)
        if domain_age_years is not None:
            age_score = min(domain_age_years / 20, 1.0) * 0.3  # 20 years = max
            score += age_score
        else:
            score += 0.15  # Assume moderate age
        
        # Content depth contribution (max 0.2)
        word_count = len(text.split()) if text else 0
        if word_count > 1000:
            score += 0.2
        elif word_count > 500:
            score += 0.15
        elif word_count > 200:
            score += 0.1
        
        # First-hand experience indicators (max 0.1)
        experience_indicators = [
            r'\b(j\'ai|je suis|nous avons|I have|we have|in my experience)\b',
            r'\b(interview|entretien|témoignage|witness|firsthand)\b',
            r'\b(sur place|on the ground|eyewitness)\b'
        ]
        for pattern in experience_indicators:
            if re.search(pattern, text, re.IGNORECASE):
                score += 0.03
        
        return min(score, 1.0)
    
    def _calculate_expertise(
        self,
        text: str,
        author_identified: bool,
        nlp_analysis: Optional[Dict]
    ) -> float:
        """
        Calculate Expertise score.
        
        Factors:
        - Author identification
        - Technical depth of content
        - Citation of sources
        - Coherence (from NLP)
        """
        score = 0.4  # Base score
        
        # Author identification (0.2)
        if author_identified:
            score += 0.2
        
        # Citation indicators (max 0.2)
        citation_patterns = [
            r'\b(selon|according to|d\'après|source:)\b',
            r'\b(étude|study|research|rapport|report)\b',
            r'\b(expert|spécialiste|chercheur|professor|Dr\.)\b',
            r'\[([\d]+)\]',  # [1] style citations
            r'https?://[^\s]+'  # Links
        ]
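        # Each match adds 0.02, capped at 0.2 (e.g. 10 matches -> min(10 * 0.02, 0.2) = 0.2).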
        citation_count = 0
        for pattern in citation_patterns:
            citation_count += len(re.findall(pattern, text, re.IGNORECASE))
        score += min(citation_count * 0.02, 0.2)
        
        # Coherence from NLP analysis (0.2)
        if nlp_analysis and 'coherence' in nlp_analysis:
            coherence = nlp_analysis['coherence']
            if isinstance(coherence, dict):
                coherence = coherence.get('score', 0.5)
            score += coherence * 0.2
        else:
            score += 0.1  # Assume moderate coherence
        
        return min(score, 1.0)
    
    def _calculate_authority(
        self,
        domain: str,
        pagerank: Optional[float],
        seo_score: Optional[float]
    ) -> float:
        """
        Calculate Authority score.
        
        Factors:
        - Known authoritative domain
        - PageRank simulation
        - SEO/technical quality
        """
        score = 0.3  # Base score
        
        # Known domain authority (max 0.5)
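        # e.g. reuters.com (0.96) -> max(0.3, 0.96 * 0.5 + 0.3) = 0.78 (illustrative)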
        for known_domain, authority in self.AUTHORITATIVE_DOMAINS.items():
            if known_domain in domain:
                score = max(score, authority * 0.5 + 0.3)
                break
        
        # Check low-trust domains
        for low_trust_domain, low_score in self.LOW_TRUST_DOMAINS.items():
            if low_trust_domain in domain:
                score = min(score, low_score)
                break
        
        # PageRank contribution (max 0.3)
        if pagerank is not None:
            score += pagerank * 0.3
        else:
            score += 0.15  # Assume moderate pagerank
        
        # SEO score contribution (max 0.2)
        if seo_score is not None:
            score += seo_score * 0.2
        else:
            score += 0.1
        
        return min(score, 1.0)
    
    def _calculate_trust(
        self,
        domain: str,
        has_https: bool,
        fact_checks: Optional[List[Dict]],
        nlp_analysis: Optional[Dict]
    ) -> float:
        """
        Calculate Trust score.
        
        Factors:
        - HTTPS
        - Fact-check results
        - Bias score (low = better)
        - Known trustworthy domain
        """
        score = 0.4  # Base score
        
        # HTTPS (0.1)
        if has_https:
            score += 0.1
        
        # Fact-check results (max 0.3)
        if fact_checks:
            positive_checks = sum(1 for fc in fact_checks 
                                  if fc.get('rating', '').lower() in ['true', 'vrai', 'correct'])
            negative_checks = sum(1 for fc in fact_checks 
                                  if fc.get('rating', '').lower() in ['false', 'faux', 'incorrect', 'pants-fire'])
            
            if positive_checks > 0:
                score += 0.2
            if negative_checks > 0:
                score -= 0.3
        
        # Bias score (max 0.2, lower bias = higher trust)
        if nlp_analysis:
            bias_data = nlp_analysis.get('bias_analysis', {})
            if isinstance(bias_data, dict):
                bias_score = bias_data.get('score', 0.3)
            else:
                bias_score = 0.3
            # Invert: low bias = high trust contribution
            score += (1 - bias_score) * 0.2
        else:
            score += 0.1
        
        # Known trustworthy domain (0.1)
        for known_domain in self.AUTHORITATIVE_DOMAINS:
            if known_domain in domain:
                score += 0.1
                break
        
        # Known low-trust domain (penalty)
        for low_trust_domain in self.LOW_TRUST_DOMAINS:
            if low_trust_domain in domain:
                score -= 0.3
                break
        
        return max(min(score, 1.0), 0.0)
    
    def explain_score(self, eeat: EEATScore, url: str) -> str:
        """
        Generate human-readable explanation of E-E-A-T score.
        
        Args:
            eeat: EEATScore instance
            url: Source URL
            
        Returns:
            Formatted explanation string
        """
        domain = self._extract_domain(url)
        
        explanations = []
        
        # Experience
        if eeat.experience >= 0.8:
            explanations.append(f"✅ **Expérience élevée** ({eeat.experience_pct}): Source établie depuis longtemps")
        elif eeat.experience >= 0.5:
            explanations.append(f"🔶 **Expérience moyenne** ({eeat.experience_pct}): Source modérément établie")
        else:
            explanations.append(f"⚠️ **Expérience faible** ({eeat.experience_pct}): Source récente ou peu connue")
        
        # Expertise
        if eeat.expertise >= 0.8:
            explanations.append(f"✅ **Expertise élevée** ({eeat.expertise_pct}): Contenu approfondi avec citations")
        elif eeat.expertise >= 0.5:
            explanations.append(f"🔶 **Expertise moyenne** ({eeat.expertise_pct}): Contenu standard")
        else:
            explanations.append(f"⚠️ **Expertise faible** ({eeat.expertise_pct}): Manque de profondeur")
        
        # Authority
        if eeat.authority >= 0.8:
            explanations.append(f"✅ **Autorité élevée** ({eeat.authority_pct}): Source très citée et reconnue")
        elif eeat.authority >= 0.5:
            explanations.append(f"🔶 **Autorité moyenne** ({eeat.authority_pct}): Source modérément reconnue")
        else:
            explanations.append(f"⚠️ **Autorité faible** ({eeat.authority_pct}): Peu de citations externes")
        
        # Trust
        if eeat.trust >= 0.8:
            explanations.append(f"✅ **Confiance élevée** ({eeat.trust_pct}): Faits vérifiés, pas de biais")
        elif eeat.trust >= 0.5:
            explanations.append(f"🔶 **Confiance moyenne** ({eeat.trust_pct}): Quelques signaux de confiance")
        else:
            explanations.append(f"⚠️ **Confiance faible** ({eeat.trust_pct}): Prudence recommandée")
        
        return "\n".join(explanations)


# Test
if __name__ == "__main__":
    calc = EEATCalculator()
    
    test_url = "https://www.lemonde.fr/politique/article/2024/01/06/trump.html"
    test_text = """
    Selon une étude du chercheur Dr. Martin, l'insurrection du 6 janvier 2021 
    au Capitol a été un événement marquant. Notre reporter sur place a témoigné
    des événements. Les experts politiques analysent les conséquences.
    """
    
    nlp_analysis = {
        'coherence': {'score': 0.8},
        'bias_analysis': {'score': 0.2}
    }
    
    eeat = calc.calculate(
        url=test_url,
        text=test_text,
        nlp_analysis=nlp_analysis,
        pagerank=0.7,
        has_https=True,
        author_identified=True
    )
    
    print("=== E-E-A-T Scores ===")
    print(f"Experience: {eeat.experience_pct}")
    print(f"Expertise:  {eeat.expertise_pct}")
    print(f"Authority:  {eeat.authority_pct}")
    print(f"Trust:      {eeat.trust_pct}")
    print(f"Overall:    {eeat.overall_pct}")
    print("\n=== Explanation ===")
    print(calc.explain_score(eeat, test_url))
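    
    # Illustrative extra check: serialize the scores with to_dict() (json is stdlib).
    import json
    print("\n=== JSON (to_dict) ===")
    print(json.dumps(eeat.to_dict(), ensure_ascii=False, indent=2))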