#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
E-E-A-T Metrics Calculator for SysCRED
========================================
Calculates Google-style E-E-A-T metrics (Experience, Expertise, Authority, Trust).
These metrics mirror modern Google ranking signals:
- Experience: Domain age, content freshness
- Expertise: Author identification, depth of content
- Authority: PageRank simulation, citations/backlinks
- Trust: HTTPS, fact-checks, low bias score
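
Example usage (a minimal sketch; argument values are illustrative):

    calc = EEATCalculator()
    score = calc.calculate(
        url="https://example.com/article",
        text="According to a recent study...",
        pagerank=0.5,
        author_identified=True,
    )
    print(score.to_dict())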
"""
from typing import Dict, Any, Optional, List
from dataclasses import dataclass
import re
import logging
logger = logging.getLogger(__name__)
@dataclass
class EEATScore:
"""E-E-A-T score container."""
experience: float # 0-1
expertise: float # 0-1
authority: float # 0-1
trust: float # 0-1
@property
def overall(self) -> float:
"""Weighted average of all E-E-A-T components."""
        # Heuristic weights, loosely following Google's emphasis on authority and trust
weights = {
'experience': 0.15,
'expertise': 0.25,
'authority': 0.35,
'trust': 0.25
}
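        # e.g. experience=0.8, expertise=0.7, authority=0.9, trust=0.6:
        #   0.8*0.15 + 0.7*0.25 + 0.9*0.35 + 0.6*0.25
        # = 0.12 + 0.175 + 0.315 + 0.15 = 0.76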
return (
self.experience * weights['experience'] +
self.expertise * weights['expertise'] +
self.authority * weights['authority'] +
self.trust * weights['trust']
)
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for JSON serialization."""
return {
'experience': round(self.experience, 3),
'expertise': round(self.expertise, 3),
'authority': round(self.authority, 3),
'trust': round(self.trust, 3),
'overall': round(self.overall, 3),
'experience_pct': f"{int(self.experience * 100)}%",
'expertise_pct': f"{int(self.expertise * 100)}%",
'authority_pct': f"{int(self.authority * 100)}%",
'trust_pct': f"{int(self.trust * 100)}%",
'overall_pct': f"{int(self.overall * 100)}%"
}
class EEATCalculator:
"""
Calculate E-E-A-T metrics from various signals.
Mirrors Google's quality rater guidelines:
- Experience: Has the author demonstrated real experience?
- Expertise: Is the content expert-level?
- Authority: Is the source recognized as authoritative?
- Trust: Is the source trustworthy?
"""
# Known authoritative domains
AUTHORITATIVE_DOMAINS = {
# News
'lemonde.fr': 0.95,
'lefigaro.fr': 0.90,
'liberation.fr': 0.88,
'nytimes.com': 0.95,
'washingtonpost.com': 0.93,
'theguardian.com': 0.92,
'bbc.com': 0.94,
'bbc.co.uk': 0.94,
'reuters.com': 0.96,
'apnews.com': 0.95,
# Academic
'nature.com': 0.98,
'science.org': 0.98,
'pubmed.ncbi.nlm.nih.gov': 0.97,
'scholar.google.com': 0.85,
# Government
'gouv.fr': 0.90,
'gov.uk': 0.90,
'whitehouse.gov': 0.88,
'europa.eu': 0.92,
# Fact-checkers
'snopes.com': 0.88,
'factcheck.org': 0.90,
'politifact.com': 0.88,
'fullfact.org': 0.89,
# Wikipedia (moderate authority)
'wikipedia.org': 0.75,
'fr.wikipedia.org': 0.75,
'en.wikipedia.org': 0.75,
}
# Low-trust domains (misinformation sources)
LOW_TRUST_DOMAINS = {
'infowars.com': 0.1,
'breitbart.com': 0.3,
'naturalnews.com': 0.15,
# Add more as needed
}
def __init__(self):
"""Initialize E-E-A-T calculator."""
pass
def calculate(
self,
url: str,
text: str,
nlp_analysis: Optional[Dict[str, Any]] = None,
pagerank: Optional[float] = None,
fact_checks: Optional[List[Dict]] = None,
domain_age_years: Optional[float] = None,
has_https: bool = True,
author_identified: bool = False,
seo_score: Optional[float] = None
) -> EEATScore:
"""
Calculate E-E-A-T scores from available signals.
Args:
url: Source URL
text: Article text content
nlp_analysis: NLP analysis results (sentiment, coherence, bias)
pagerank: Simulated PageRank score (0-1)
fact_checks: List of fact-check results
domain_age_years: Domain age in years (from WHOIS)
has_https: Whether site uses HTTPS
author_identified: Whether author is clearly identified
seo_score: SEO/technical quality score
Returns:
EEATScore with all component scores
"""
# Extract domain from URL
domain = self._extract_domain(url)
# Calculate each component
experience = self._calculate_experience(
domain_age_years,
text,
nlp_analysis
)
expertise = self._calculate_expertise(
text,
author_identified,
nlp_analysis
)
authority = self._calculate_authority(
domain,
pagerank,
seo_score
)
trust = self._calculate_trust(
domain,
has_https,
fact_checks,
nlp_analysis
)
return EEATScore(
experience=experience,
expertise=expertise,
authority=authority,
trust=trust
)
    def _extract_domain(self, url: str) -> str:
        """Extract the bare domain from a URL (scheme and 'www.' prefix stripped)."""
        # The regex below covers http(s) URLs; for fully general URL handling,
        # urllib.parse.urlparse(url).netloc is the standard-library alternative.
        match = re.search(r'https?://(?:www\.)?([^/]+)', url)
        return match.group(1).lower() if match else url.lower()
def _calculate_experience(
self,
domain_age_years: Optional[float],
text: str,
nlp_analysis: Optional[Dict]
) -> float:
"""
Calculate Experience score.
Factors:
- Domain age (longer = more experience)
- Content freshness (recently updated)
- First-hand experience indicators in text
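
        Worked example (per the rules below): base 0.5
        + 10-year-old domain (0.15) + 800 words (0.15)
        + one first-hand indicator (0.03) = 0.83.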
"""
score = 0.5 # Base score
# Domain age contribution (max 0.3)
if domain_age_years is not None:
age_score = min(domain_age_years / 20, 1.0) * 0.3 # 20 years = max
score += age_score
else:
score += 0.15 # Assume moderate age
# Content depth contribution (max 0.2)
word_count = len(text.split()) if text else 0
if word_count > 1000:
score += 0.2
elif word_count > 500:
score += 0.15
elif word_count > 200:
score += 0.1
        # First-hand experience indicators (3 patterns * 0.03 = up to 0.09)
experience_indicators = [
r'\b(j\'ai|je suis|nous avons|I have|we have|in my experience)\b',
r'\b(interview|entretien|témoignage|witness|firsthand)\b',
r'\b(sur place|on the ground|eyewitness)\b'
]
for pattern in experience_indicators:
if re.search(pattern, text, re.IGNORECASE):
score += 0.03
return min(score, 1.0)
def _calculate_expertise(
self,
text: str,
author_identified: bool,
nlp_analysis: Optional[Dict]
) -> float:
"""
Calculate Expertise score.
Factors:
- Author identification
- Technical depth of content
- Citation of sources
- Coherence (from NLP)
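
        Worked example: base 0.4 + identified author (0.2)
        + 5 citation matches (0.10) + coherence 0.8 (0.16) = 0.86.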
"""
score = 0.4 # Base score
# Author identification (0.2)
if author_identified:
score += 0.2
# Citation indicators (max 0.2)
citation_patterns = [
r'\b(selon|according to|d\'après|source:)\b',
r'\b(étude|study|research|rapport|report)\b',
            r'\b(expert|spécialiste|chercheur|professor)\b|\bDr\.',  # 'Dr.' sits outside the trailing \b, which never matches after '.'
r'\[([\d]+)\]', # [1] style citations
r'https?://[^\s]+' # Links
]
citation_count = 0
for pattern in citation_patterns:
citation_count += len(re.findall(pattern, text, re.IGNORECASE))
score += min(citation_count * 0.02, 0.2)
# Coherence from NLP analysis (0.2)
if nlp_analysis and 'coherence' in nlp_analysis:
coherence = nlp_analysis['coherence']
if isinstance(coherence, dict):
coherence = coherence.get('score', 0.5)
score += coherence * 0.2
else:
score += 0.1 # Assume moderate coherence
return min(score, 1.0)
def _calculate_authority(
self,
domain: str,
pagerank: Optional[float],
seo_score: Optional[float]
) -> float:
"""
Calculate Authority score.
Factors:
- Known authoritative domain
- PageRank simulation
- SEO/technical quality
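
        Worked example: an unlisted domain keeps the 0.3 base;
        pagerank 0.4 adds 0.12 and seo_score 0.5 adds 0.10 -> 0.52.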
"""
score = 0.3 # Base score
        # Known-domain authority: lifts the score to 0.3 + authority/2 (at most ~0.79)
for known_domain, authority in self.AUTHORITATIVE_DOMAINS.items():
if known_domain in domain:
score = max(score, authority * 0.5 + 0.3)
break
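        # Note: the substring check is permissive ('bbc.com' also matches
        # 'bbc.com.example.net'); a stricter variant would be
        # domain == known_domain or domain.endswith('.' + known_domain).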
# Check low-trust domains
for low_trust_domain, low_score in self.LOW_TRUST_DOMAINS.items():
if low_trust_domain in domain:
score = min(score, low_score)
break
# PageRank contribution (max 0.3)
if pagerank is not None:
score += pagerank * 0.3
else:
score += 0.15 # Assume moderate pagerank
# SEO score contribution (max 0.2)
if seo_score is not None:
score += seo_score * 0.2
else:
score += 0.1
return min(score, 1.0)
def _calculate_trust(
self,
domain: str,
has_https: bool,
fact_checks: Optional[List[Dict]],
nlp_analysis: Optional[Dict]
) -> float:
"""
Calculate Trust score.
Factors:
- HTTPS
- Fact-check results
- Bias score (low = better)
- Known trustworthy domain
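
        Worked example: base 0.4 + HTTPS (0.1) + one positive
        fact-check (0.2) + bias 0.2 ((1 - 0.2) * 0.2 = 0.16)
        + listed domain (0.1) = 0.96.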
"""
score = 0.4 # Base score
# HTTPS (0.1)
if has_https:
score += 0.1
# Fact-check results (max 0.3)
if fact_checks:
positive_checks = sum(1 for fc in fact_checks
if fc.get('rating', '').lower() in ['true', 'vrai', 'correct'])
negative_checks = sum(1 for fc in fact_checks
if fc.get('rating', '').lower() in ['false', 'faux', 'incorrect', 'pants-fire'])
if positive_checks > 0:
score += 0.2
if negative_checks > 0:
score -= 0.3
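            # e.g. one positive and one negative check nets 0.2 - 0.3 = -0.1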
# Bias score (max 0.2, lower bias = higher trust)
if nlp_analysis:
bias_data = nlp_analysis.get('bias_analysis', {})
if isinstance(bias_data, dict):
bias_score = bias_data.get('score', 0.3)
else:
bias_score = 0.3
# Invert: low bias = high trust contribution
score += (1 - bias_score) * 0.2
else:
score += 0.1
# Known trustworthy domain (0.1)
for known_domain in self.AUTHORITATIVE_DOMAINS:
if known_domain in domain:
score += 0.1
break
# Known low-trust domain (penalty)
for low_trust_domain in self.LOW_TRUST_DOMAINS:
if low_trust_domain in domain:
score -= 0.3
break
return max(min(score, 1.0), 0.0)
def explain_score(self, eeat: EEATScore, url: str) -> str:
"""
Generate human-readable explanation of E-E-A-T score.
Args:
eeat: EEATScore instance
url: Source URL
Returns:
            Formatted multi-line explanation string (French UI text)
"""
domain = self._extract_domain(url)
explanations = []
# Experience
if eeat.experience >= 0.8:
explanations.append(f"✅ **Expérience élevée** ({eeat.experience_pct}): Source établie depuis longtemps")
elif eeat.experience >= 0.5:
explanations.append(f"🔶 **Expérience moyenne** ({eeat.experience_pct}): Source modérément établie")
else:
explanations.append(f"⚠️ **Expérience faible** ({eeat.experience_pct}): Source récente ou peu connue")
# Expertise
if eeat.expertise >= 0.8:
explanations.append(f"✅ **Expertise élevée** ({eeat.expertise_pct}): Contenu approfondi avec citations")
elif eeat.expertise >= 0.5:
explanations.append(f"🔶 **Expertise moyenne** ({eeat.expertise_pct}): Contenu standard")
else:
explanations.append(f"⚠️ **Expertise faible** ({eeat.expertise_pct}): Manque de profondeur")
# Authority
if eeat.authority >= 0.8:
explanations.append(f"✅ **Autorité élevée** ({eeat.authority_pct}): Source très citée et reconnue")
elif eeat.authority >= 0.5:
explanations.append(f"🔶 **Autorité moyenne** ({eeat.authority_pct}): Source modérément reconnue")
else:
explanations.append(f"⚠️ **Autorité faible** ({eeat.authority_pct}): Peu de citations externes")
# Trust
if eeat.trust >= 0.8:
explanations.append(f"✅ **Confiance élevée** ({eeat.trust_pct}): Faits vérifiés, pas de biais")
elif eeat.trust >= 0.5:
explanations.append(f"🔶 **Confiance moyenne** ({eeat.trust_pct}): Quelques signaux de confiance")
else:
explanations.append(f"⚠️ **Confiance faible** ({eeat.trust_pct}): Prudence recommandée")
return "\n".join(explanations)
# Test
if __name__ == "__main__":
calc = EEATCalculator()
test_url = "https://www.lemonde.fr/politique/article/2024/01/06/trump.html"
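    # French-language sample: exercises the French patterns above
    # ('selon', 'étude', 'chercheur', 'sur place').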
test_text = """
Selon une étude du chercheur Dr. Martin, l'insurrection du 6 janvier 2021
au Capitol a été un événement marquant. Notre reporter sur place a témoigné
des événements. Les experts politiques analysent les conséquences.
"""
nlp_analysis = {
'coherence': {'score': 0.8},
'bias_analysis': {'score': 0.2}
}
eeat = calc.calculate(
url=test_url,
text=test_text,
nlp_analysis=nlp_analysis,
pagerank=0.7,
has_https=True,
author_identified=True
)
print("=== E-E-A-T Scores ===")
print(f"Experience: {eeat.experience_pct}")
print(f"Expertise: {eeat.expertise_pct}")
print(f"Authority: {eeat.authority_pct}")
print(f"Trust: {eeat.trust_pct}")
print(f"Overall: {eeat.overall_pct}")
print("\n=== Explanation ===")
print(calc.explain_score(eeat, test_url))