#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
E-E-A-T Metrics Calculator for SysCRED
========================================
Calculates Google-style E-E-A-T metrics (Experience, Expertise, Authority, Trust).
These metrics mirror modern Google ranking signals:
- Experience: Domain age, content freshness
- Expertise: Author identification, depth of content
- Authority: PageRank simulation, citations/backlinks
- Trust: HTTPS, fact-checks, low bias score
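
Example usage (a minimal sketch; argument values are illustrative):

    calc = EEATCalculator()
    score = calc.calculate(
        url="https://example.com/article",
        text="According to a recent study...",
        pagerank=0.5,
        author_identified=True,
    )
    print(score.to_dict())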
"""
from typing import Dict, Any, Optional, List
from dataclasses import dataclass
import re
import logging
logger = logging.getLogger(__name__)
@dataclass
class EEATScore:
"""E-E-A-T score container."""
experience: float # 0-1
expertise: float # 0-1
authority: float # 0-1
trust: float # 0-1
@property
def overall(self) -> float:
"""Weighted average of all E-E-A-T components."""
        # Heuristic weights, loosely following Google's emphasis on authority and trust
weights = {
'experience': 0.15,
'expertise': 0.25,
'authority': 0.35,
'trust': 0.25
}
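        # e.g. experience=0.8, expertise=0.7, authority=0.9, trust=0.6:
        #   0.8*0.15 + 0.7*0.25 + 0.9*0.35 + 0.6*0.25
        # = 0.12 + 0.175 + 0.315 + 0.15 = 0.76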
return (
self.experience * weights['experience'] +
self.expertise * weights['expertise'] +
self.authority * weights['authority'] +
self.trust * weights['trust']
)
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary for JSON serialization."""
return {
'experience': round(self.experience, 3),
'expertise': round(self.expertise, 3),
'authority': round(self.authority, 3),
'trust': round(self.trust, 3),
'overall': round(self.overall, 3),
'experience_pct': f"{int(self.experience * 100)}%",
'expertise_pct': f"{int(self.expertise * 100)}%",
'authority_pct': f"{int(self.authority * 100)}%",
'trust_pct': f"{int(self.trust * 100)}%",
'overall_pct': f"{int(self.overall * 100)}%"
}
class EEATCalculator:
"""
Calculate E-E-A-T metrics from various signals.
Mirrors Google's quality rater guidelines:
- Experience: Has the author demonstrated real experience?
- Expertise: Is the content expert-level?
- Authority: Is the source recognized as authoritative?
- Trust: Is the source trustworthy?
"""
# Known authoritative domains
AUTHORITATIVE_DOMAINS = {
# News
'lemonde.fr': 0.95,
'lefigaro.fr': 0.90,
'liberation.fr': 0.88,
'nytimes.com': 0.95,
'washingtonpost.com': 0.93,
'theguardian.com': 0.92,
'bbc.com': 0.94,
'bbc.co.uk': 0.94,
'reuters.com': 0.96,
'apnews.com': 0.95,
# Academic
'nature.com': 0.98,
'science.org': 0.98,
'pubmed.ncbi.nlm.nih.gov': 0.97,
'scholar.google.com': 0.85,
# Government
'gouv.fr': 0.90,
'gov.uk': 0.90,
'whitehouse.gov': 0.88,
'europa.eu': 0.92,
# Fact-checkers
'snopes.com': 0.88,
'factcheck.org': 0.90,
'politifact.com': 0.88,
'fullfact.org': 0.89,
# Wikipedia (moderate authority)
'wikipedia.org': 0.75,
'fr.wikipedia.org': 0.75,
'en.wikipedia.org': 0.75,
}
# Low-trust domains (misinformation sources)
LOW_TRUST_DOMAINS = {
'infowars.com': 0.1,
'breitbart.com': 0.3,
'naturalnews.com': 0.15,
# Add more as needed
}
def __init__(self):
"""Initialize E-E-A-T calculator."""
pass
def calculate(
self,
url: str,
text: str,
nlp_analysis: Optional[Dict[str, Any]] = None,
pagerank: Optional[float] = None,
fact_checks: Optional[List[Dict]] = None,
domain_age_years: Optional[float] = None,
has_https: bool = True,
author_identified: bool = False,
seo_score: Optional[float] = None
) -> EEATScore:
"""
Calculate E-E-A-T scores from available signals.
Args:
url: Source URL
text: Article text content
nlp_analysis: NLP analysis results (sentiment, coherence, bias)
pagerank: Simulated PageRank score (0-1)
fact_checks: List of fact-check results
domain_age_years: Domain age in years (from WHOIS)
has_https: Whether site uses HTTPS
author_identified: Whether author is clearly identified
seo_score: SEO/technical quality score
Returns:
EEATScore with all component scores
"""
# Extract domain from URL
domain = self._extract_domain(url)
# Calculate each component
experience = self._calculate_experience(
domain_age_years,
text,
nlp_analysis
)
expertise = self._calculate_expertise(
text,
author_identified,
nlp_analysis
)
authority = self._calculate_authority(
domain,
pagerank,
seo_score
)
trust = self._calculate_trust(
domain,
has_https,
fact_checks,
nlp_analysis
)
return EEATScore(
experience=experience,
expertise=expertise,
authority=authority,
trust=trust
)
    def _extract_domain(self, url: str) -> str:
        """Extract the bare domain from a URL (scheme and 'www.' prefix stripped)."""
        # The regex below covers http(s) URLs; for fully general URL handling,
        # urllib.parse.urlparse(url).netloc is the standard-library alternative.
        match = re.search(r'https?://(?:www\.)?([^/]+)', url)
        return match.group(1).lower() if match else url.lower()
def _calculate_experience(
self,
domain_age_years: Optional[float],
text: str,
nlp_analysis: Optional[Dict]
) -> float:
"""
Calculate Experience score.
Factors:
- Domain age (longer = more experience)
- Content freshness (recently updated)
- First-hand experience indicators in text
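
        Worked example (per the rules below): base 0.5
        + 10-year-old domain (0.15) + 800 words (0.15)
        + one first-hand indicator (0.03) = 0.83.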
"""
score = 0.5 # Base score
# Domain age contribution (max 0.3)
if domain_age_years is not None:
age_score = min(domain_age_years / 20, 1.0) * 0.3 # 20 years = max
score += age_score
else:
score += 0.15 # Assume moderate age
# Content depth contribution (max 0.2)
word_count = len(text.split()) if text else 0
if word_count > 1000:
score += 0.2
elif word_count > 500:
score += 0.15
elif word_count > 200:
score += 0.1
        # First-hand experience indicators (3 patterns * 0.03 = up to 0.09)
experience_indicators = [
r'\b(j\'ai|je suis|nous avons|I have|we have|in my experience)\b',
r'\b(interview|entretien|témoignage|witness|firsthand)\b',
r'\b(sur place|on the ground|eyewitness)\b'
]
for pattern in experience_indicators:
if re.search(pattern, text, re.IGNORECASE):
score += 0.03
return min(score, 1.0)
def _calculate_expertise(
self,
text: str,
author_identified: bool,
nlp_analysis: Optional[Dict]
) -> float:
"""
Calculate Expertise score.
Factors:
- Author identification
- Technical depth of content
- Citation of sources
- Coherence (from NLP)
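
        Worked example: base 0.4 + identified author (0.2)
        + 5 citation matches (0.10) + coherence 0.8 (0.16) = 0.86.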
"""
score = 0.4 # Base score
# Author identification (0.2)
if author_identified:
score += 0.2
# Citation indicators (max 0.2)
citation_patterns = [
r'\b(selon|according to|d\'après|source:)\b',
r'\b(étude|study|research|rapport|report)\b',
            r'\b(expert|spécialiste|chercheur|professor)\b|\bDr\.',  # 'Dr.' sits outside the trailing \b, which never matches after '.'
r'\[([\d]+)\]', # [1] style citations
r'https?://[^\s]+' # Links
]
citation_count = 0
for pattern in citation_patterns:
citation_count += len(re.findall(pattern, text, re.IGNORECASE))
score += min(citation_count * 0.02, 0.2)
# Coherence from NLP analysis (0.2)
if nlp_analysis and 'coherence' in nlp_analysis:
coherence = nlp_analysis['coherence']
if isinstance(coherence, dict):
coherence = coherence.get('score', 0.5)
score += coherence * 0.2
else:
score += 0.1 # Assume moderate coherence
return min(score, 1.0)
def _calculate_authority(
self,
domain: str,
pagerank: Optional[float],
seo_score: Optional[float]
) -> float:
"""
Calculate Authority score.
Factors:
- Known authoritative domain
- PageRank simulation
- SEO/technical quality
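
        Worked example: an unlisted domain keeps the 0.3 base;
        pagerank 0.4 adds 0.12 and seo_score 0.5 adds 0.10 -> 0.52.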
"""
score = 0.3 # Base score
        # Known-domain authority: lifts the score to 0.3 + authority/2 (at most ~0.79)
for known_domain, authority in self.AUTHORITATIVE_DOMAINS.items():
if known_domain in domain:
score = max(score, authority * 0.5 + 0.3)
break
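        # Note: the substring check is permissive ('bbc.com' also matches
        # 'bbc.com.example.net'); a stricter variant would be
        # domain == known_domain or domain.endswith('.' + known_domain).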
# Check low-trust domains
for low_trust_domain, low_score in self.LOW_TRUST_DOMAINS.items():
if low_trust_domain in domain:
score = min(score, low_score)
break
# PageRank contribution (max 0.3)
if pagerank is not None:
score += pagerank * 0.3
else:
score += 0.15 # Assume moderate pagerank
# SEO score contribution (max 0.2)
if seo_score is not None:
score += seo_score * 0.2
else:
score += 0.1
return min(score, 1.0)
def _calculate_trust(
self,
domain: str,
has_https: bool,
fact_checks: Optional[List[Dict]],
nlp_analysis: Optional[Dict]
) -> float:
"""
Calculate Trust score.
Factors:
- HTTPS
- Fact-check results
- Bias score (low = better)
- Known trustworthy domain
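
        Worked example: base 0.4 + HTTPS (0.1) + one positive
        fact-check (0.2) + bias 0.2 ((1 - 0.2) * 0.2 = 0.16)
        + listed domain (0.1) = 0.96.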
"""
score = 0.4 # Base score
# HTTPS (0.1)
if has_https:
score += 0.1
# Fact-check results (max 0.3)
if fact_checks:
positive_checks = sum(1 for fc in fact_checks
if fc.get('rating', '').lower() in ['true', 'vrai', 'correct'])
negative_checks = sum(1 for fc in fact_checks
if fc.get('rating', '').lower() in ['false', 'faux', 'incorrect', 'pants-fire'])
if positive_checks > 0:
score += 0.2
if negative_checks > 0:
score -= 0.3
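            # e.g. one positive and one negative check nets 0.2 - 0.3 = -0.1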
# Bias score (max 0.2, lower bias = higher trust)
if nlp_analysis:
bias_data = nlp_analysis.get('bias_analysis', {})
if isinstance(bias_data, dict):
bias_score = bias_data.get('score', 0.3)
else:
bias_score = 0.3
# Invert: low bias = high trust contribution
score += (1 - bias_score) * 0.2
else:
score += 0.1
# Known trustworthy domain (0.1)
for known_domain in self.AUTHORITATIVE_DOMAINS:
if known_domain in domain:
score += 0.1
break
# Known low-trust domain (penalty)
for low_trust_domain in self.LOW_TRUST_DOMAINS:
if low_trust_domain in domain:
score -= 0.3
break
return max(min(score, 1.0), 0.0)
def explain_score(self, eeat: EEATScore, url: str) -> str:
"""
Generate human-readable explanation of E-E-A-T score.
Args:
eeat: EEATScore instance
url: Source URL
Returns:
            Formatted multi-line explanation string (French UI text)
"""
domain = self._extract_domain(url)
explanations = []
# Experience
if eeat.experience >= 0.8:
explanations.append(f"✅ **Expérience élevée** ({eeat.experience_pct}): Source établie depuis longtemps")
elif eeat.experience >= 0.5:
explanations.append(f"🔶 **Expérience moyenne** ({eeat.experience_pct}): Source modérément établie")
else:
explanations.append(f"⚠️ **Expérience faible** ({eeat.experience_pct}): Source récente ou peu connue")
# Expertise
if eeat.expertise >= 0.8:
explanations.append(f"✅ **Expertise élevée** ({eeat.expertise_pct}): Contenu approfondi avec citations")
elif eeat.expertise >= 0.5:
explanations.append(f"🔶 **Expertise moyenne** ({eeat.expertise_pct}): Contenu standard")
else:
explanations.append(f"⚠️ **Expertise faible** ({eeat.expertise_pct}): Manque de profondeur")
# Authority
if eeat.authority >= 0.8:
explanations.append(f"✅ **Autorité élevée** ({eeat.authority_pct}): Source très citée et reconnue")
elif eeat.authority >= 0.5:
explanations.append(f"🔶 **Autorité moyenne** ({eeat.authority_pct}): Source modérément reconnue")
else:
explanations.append(f"⚠️ **Autorité faible** ({eeat.authority_pct}): Peu de citations externes")
# Trust
if eeat.trust >= 0.8:
explanations.append(f"✅ **Confiance élevée** ({eeat.trust_pct}): Faits vérifiés, pas de biais")
elif eeat.trust >= 0.5:
explanations.append(f"🔶 **Confiance moyenne** ({eeat.trust_pct}): Quelques signaux de confiance")
else:
explanations.append(f"⚠️ **Confiance faible** ({eeat.trust_pct}): Prudence recommandée")
return "\n".join(explanations)
# Test
if __name__ == "__main__":
calc = EEATCalculator()
test_url = "https://www.lemonde.fr/politique/article/2024/01/06/trump.html"
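    # French-language sample: exercises the French patterns above
    # ('selon', 'étude', 'chercheur', 'sur place').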
test_text = """
Selon une étude du chercheur Dr. Martin, l'insurrection du 6 janvier 2021
au Capitol a été un événement marquant. Notre reporter sur place a témoigné
des événements. Les experts politiques analysent les conséquences.
"""
nlp_analysis = {
'coherence': {'score': 0.8},
'bias_analysis': {'score': 0.2}
}
eeat = calc.calculate(
url=test_url,
text=test_text,
nlp_analysis=nlp_analysis,
pagerank=0.7,
has_https=True,
author_identified=True
)
print("=== E-E-A-T Scores ===")
print(f"Experience: {eeat.experience_pct}")
print(f"Expertise: {eeat.expertise_pct}")
print(f"Authority: {eeat.authority_pct}")
print(f"Trust: {eeat.trust_pct}")
print(f"Overall: {eeat.overall_pct}")
print("\n=== Explanation ===")
print(calc.explain_score(eeat, test_url))