# satyacheck-backend / core / layer5_citation_network.py
# omiii2005 -- "Initial clean deploy", commit 87eb9ac
"""
SatyaCheck β€” Layer 5: Citation Network & Author Credibility Analysis
ΰ€Έΰ€€ΰ₯ΰ€― ΰ€•ΰ₯€ ΰ€œΰ€Ύΰ€ΰ€š
Analyses the source citation graph of a news article:
1. Citation extraction β€” finds all sources referenced in the article
2. Source verification β€” checks if cited sources are real and accessible
3. Author credibility scoring β€” verifies author's journalism track record
4. Publication velocity detection β€” flags content farms (50+ articles/day)
5. Citation network score β€” how well-connected this article is to verified info
Architecture:
Input: (url, title, body_text, domain)
Methods: BeautifulSoup parsing + aiohttp + heuristic citation analysis
Output: Layer5Result
Research basis:
- Network-based fake news detection (Shu et al., 2019)
- "Fake News Detection via NLP is Insufficient" β€” need social/network signals
- FakeNewsNet graph feature analysis
"""
import asyncio
import logging
import re
from dataclasses import asdict, dataclass
from typing import List, Optional, Tuple
from urllib.parse import urlparse

import aiohttp
from bs4 import BeautifulSoup

from core.config import settings
logger = logging.getLogger("satyacheck.layer5")
# ═══════════════════════════════════════════════════════════════════════════════
# RESULT SCHEMA (local dataclass β€” mapped to Pydantic in pipeline)
# ═══════════════════════════════════════════════════════════════════════════════
@dataclass
class Layer5Result:
    """Result container for Layer 5 (citation network & author credibility).

    Kept as a plain dataclass here; mapped to a Pydantic model later in
    the pipeline (see module docstring).
    """

    status: str                  # "pass" | "warn" | "fail" (see _determine_status)
    citation_count: int          # total citations found in the article
    verified_citations: int      # citations confirmed trusted / accessible
    unverified_citations: int    # citations that failed verification
    author_credibility: int      # 0-100 author credibility score
    publication_velocity: str    # human-readable velocity description
    velocity_flag: bool          # True when the domain looks like a content farm
    network_score: int           # 0-100 overall citation-network score
    citation_details: List[str]  # human-readable detail messages (max 6)

    def to_dict(self) -> dict:
        """Return a JSON-serialisable dict view of this result.

        Note: ``asdict`` copies container fields, so mutating the returned
        dict does not affect this instance.
        """
        return asdict(self)
# ═══════════════════════════════════════════════════════════════════════════════
# KNOWN HIGH-CREDIBILITY DOMAINS (trusted citation sources)
# ═══════════════════════════════════════════════════════════════════════════════
# Domains treated as inherently credible when cited by an article.
# Matched by substring against the citation's netloc in _verify_citations,
# so entries should be distinctive domain suffixes.
TRUSTED_CITATION_DOMAINS = {
    # Government / Official
    "pib.gov.in", "mea.gov.in", "india.gov.in", "mohfw.gov.in",
    "rbi.org.in", "sebi.gov.in", "eci.gov.in",
    # News Wire Services
    "ptinews.com", "pti.in", "ani.in", "ians.in",
    "reuters.com", "apnews.com", "bloomberg.com",
    # Trusted Indian media
    "thehindu.com", "ndtv.com", "indianexpress.com",
    "hindustantimes.com", "livemint.com", "economictimes.com",
    "businessstandard.com", "telegraphindia.com",
    # Academic / Research
    "scholar.google.com", "researchgate.net", "pubmed.ncbi.nlm.nih.gov",
    "who.int", "un.org", "worldbank.org",
    # Fact-checkers
    "boomlive.in", "altnews.in", "factcheckindia.com",
}
# Known low-credibility / content-farm domains — citing any of these
# counts against the article in _verify_citations.
KNOWN_CONTENT_FARMS = {
    "postcard.news", "rightlog.in", "organiser.org",
    "pgurus.com", "opindia.com", "swarajyamag.com",
}
# Byline extraction patterns, applied to the first 500 chars of the body.
# NOTE(review): _analyse_author searches these with re.IGNORECASE, which
# defeats the [A-Z][a-z] capitalisation classes — confirm intent.
AUTHOR_PATTERNS = [
    r"by\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})",  # "By John Smith"
    r"author[:\s]+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})",
    r"written\s+by\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})",
    r"reporter[:\s]+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})",
]
# Patterns that indicate no named author (anonymity = red flag)
ANONYMOUS_PATTERNS = [
    r"\bstaff reporter\b", r"\bweb desk\b", r"\bnews desk\b",
    r"\beditorial team\b", r"\bour correspondent\b",
    r"\bspecial correspondent\b", r"\b(ANI|PTI|IANS|Staff)\b",
]
# ═══════════════════════════════════════════════════════════════════════════════
# MAIN LAYER 5 FUNCTION
# ═══════════════════════════════════════════════════════════════════════════════
async def run_layer5(
    url: str,
    title: str,
    body_text: str,
    domain: str,
) -> Layer5Result:
    """
    Run the full Layer 5 citation-network and author-credibility pipeline.

    Args:
        url: Article URL.
        title: Article headline (not used by this layer's current steps).
        body_text: Full article body text.
        domain: Root domain of the article.

    Returns:
        Layer5Result with counts, scores, and up to 6 detail messages.
    """
    logger.info(f"πŸ“° Layer 5: Starting citation network analysis for: {url[:60]}...")

    # 1. Pull every cited URL (explicit links + implicit source mentions).
    citations = _extract_citations(body_text, url)

    # 2. Verify cited sources; 3. score the author; 4. check publication
    #    velocity — each step contributes human-readable detail messages.
    verified, unverified, details = await _verify_citations(citations, body_text)

    author_score, extra = _analyse_author(body_text, domain)
    details += extra

    velocity_desc, flagged, extra = await _check_publication_velocity(domain)
    details += extra

    # 5. Fold everything into a single 0-100 network score.
    net_score = _compute_network_score(
        cited_urls=citations,
        verified=verified,
        author_score=author_score,
        velocity_flag=flagged,
        domain=domain,
    )

    # 6. Map the raw numbers onto pass / warn / fail.
    n_citations = len(citations)
    status = _determine_status(n_citations, verified, author_score, flagged, net_score)

    logger.info(
        f"βœ… Layer 5 done β€” status={status}, citations={n_citations}, "
        f"verified={verified}, author={author_score}, network={net_score}"
    )
    return Layer5Result(
        status=status,
        citation_count=n_citations,
        verified_citations=verified,
        unverified_citations=unverified,
        author_credibility=author_score,
        publication_velocity=velocity_desc,
        velocity_flag=flagged,
        network_score=net_score,
        citation_details=details[:6],  # cap at 6 detail messages
    )
# ═══════════════════════════════════════════════════════════════════════════════
# CITATION EXTRACTION
# ═══════════════════════════════════════════════════════════════════════════════
def _extract_citations(body_text: str, article_url: str) -> List[str]:
"""
Extract all URLs cited in the article body.
Also extracts implicit source references (e.g., 'according to Reuters').
"""
# Extract explicit URLs
url_pattern = r'https?://[^\s<>"\'()]+[^\s<>"\'().,;:!?]'
urls = re.findall(url_pattern, body_text)
# Filter out self-references and social media
article_domain = urlparse(article_url).netloc
filtered = [
u for u in urls
if urlparse(u).netloc != article_domain
and "twitter.com" not in u
and "facebook.com" not in u
and "instagram.com" not in u
]
# Add implicit mentions of trusted sources
implicit_sources = {
"government": "gov.in",
"pib": "pib.gov.in",
"who": "who.int",
"reuters": "reuters.com",
"associated press": "apnews.com",
"niti aayog": "niti.gov.in",
"rbi": "rbi.org.in",
"supreme court": "sci.gov.in",
}
for keyword, source_domain in implicit_sources.items():
if keyword.lower() in body_text.lower():
if source_domain not in " ".join(filtered):
filtered.append(f"https://{source_domain}/mentioned")
return list(set(filtered))[:15] # Cap at 15 citations to check
async def _verify_citations(
    urls: List[str],
    body_text: str,
) -> Tuple[int, int, List[str]]:
    """
    Attempt to verify each cited URL is accessible and credible.

    Each URL is first matched against the trusted / content-farm domain
    lists; anything unknown gets a lightweight HTTP HEAD accessibility
    check (4 s timeout). At most the first 8 URLs are checked, and at
    most 4 detail messages are kept.

    Args:
        urls: Cited URLs (may include synthetic ".../mentioned" entries).
        body_text: Article body (currently unused; kept for interface
            stability).

    Returns:
        (verified_count, unverified_count, detail_messages)
    """
    details: List[str] = []
    if not urls:
        details.append("⚠️ No external sources or citations found in this article")
        return 0, 0, details

    verified = 0
    unverified = 0

    async def check_url(session: aiohttp.ClientSession, url: str) -> Tuple[bool, str]:
        parsed = urlparse(url)
        # removeprefix, not lstrip: lstrip("www.") strips *characters* in
        # the set {w, .} and would mangle hosts like "who.int" -> "ho.int".
        netloc = parsed.netloc.removeprefix("www.")
        # Check against known trusted domains first — no HTTP needed.
        for trusted in TRUSTED_CITATION_DOMAINS:
            if trusted in netloc:
                return True, f"βœ… Cites trusted source: {netloc}"
        for farm in KNOWN_CONTENT_FARMS:
            if farm in netloc:
                return False, f"🚨 Cites known unreliable source: {netloc}"
        # Try HTTP HEAD check for accessibility.
        try:
            if "/mentioned" in url:
                # Implicit mention β€” don't HTTP check, just note
                return True, f"βœ… References {netloc} (mentioned in text)"
            async with session.head(
                url,
                timeout=aiohttp.ClientTimeout(total=4),
                allow_redirects=True,
            ) as resp:
                if resp.status < 400:
                    return True, f"βœ… Source accessible: {netloc}"
                return False, f"⚠️ Source returned error ({resp.status}): {netloc}"
        except Exception:
            return False, f"⚠️ Could not verify source: {netloc}"

    # One shared session for all checks (the original opened a new
    # ClientSession per URL, defeating connection reuse).
    async with aiohttp.ClientSession() as session:
        tasks = [check_url(session, url) for url in urls[:8]]  # max 8 checks
        results = await asyncio.gather(*tasks, return_exceptions=True)

    for result in results:
        if isinstance(result, Exception):
            # gather() surfaced an unexpected error; count as unverified.
            unverified += 1
            continue
        ok, message = result
        if ok:
            verified += 1
        else:
            unverified += 1
        if len(details) < 4:
            details.append(message)
    return verified, unverified, details
# ═══════════════════════════════════════════════════════════════════════════════
# AUTHOR CREDIBILITY
# ═══════════════════════════════════════════════════════════════════════════════
def _analyse_author(body_text: str, domain: str) -> Tuple[int, List[str]]:
    """
    Analyse the author's credibility.

    Signals (all searched in the first 500 chars of the body):
      - Named byline (+20) vs anonymous desk attribution (-15) vs no
        byline at all (-10)
      - Credibility of the publishing domain (+25 / +10 / -20 bands)

    Args:
        body_text: Full article body text.
        domain: Root domain of the article.

    Returns:
        (author_credibility_score: int [0-100], detail_messages: List[str])
    """
    details: List[str] = []
    score = 50  # start neutral

    # Look for a named byline. NOTE(review): re.IGNORECASE defeats the
    # [A-Z][a-z] capitalisation classes in AUTHOR_PATTERNS, so this can
    # also capture uncapitalised phrases after "by" — confirm intent.
    author_found = False
    for pattern in AUTHOR_PATTERNS:
        match = re.search(pattern, body_text[:500], re.IGNORECASE)
        if match:
            author_name = match.group(1).strip()
            author_found = True
            score += 20
            details.append(f"βœ… Named author found: {author_name}")
            break
    if not author_found:
        # No named author: distinguish deliberate desk attribution from
        # a missing byline altogether.
        for pattern in ANONYMOUS_PATTERNS:
            if re.search(pattern, body_text[:500], re.IGNORECASE):
                score -= 15
                details.append("⚠️ Article attributed to 'desk' or unnamed staff β€” no individual author accountable")
                break
        else:
            details.append("⚠️ No author byline detected")
            score -= 10

    # Domain contributes to author credibility. Local import presumably
    # avoids a circular import at module load time — TODO confirm.
    from core.layer3_authority import CREDIBILITY_INDEX
    # removeprefix, not lstrip: lstrip("www.") strips characters and would
    # mangle domains such as "who.int" -> "ho.int".
    domain_score = CREDIBILITY_INDEX.get(
        domain, CREDIBILITY_INDEX.get(domain.removeprefix("www."), 35)
    )
    if domain_score >= 80:
        score += 25
        details.append(f"βœ… Published in high-credibility outlet (score: {domain_score}/100)")
    elif domain_score >= 50:
        score += 10
    elif domain_score < 30:
        score -= 20
        details.append(f"🚨 Published in low-credibility outlet (score: {domain_score}/100)")
    return max(0, min(100, score)), details
# ═══════════════════════════════════════════════════════════════════════════════
# PUBLICATION VELOCITY CHECK
# ═══════════════════════════════════════════════════════════════════════════════
async def _check_publication_velocity(domain: str) -> Tuple[str, bool, List[str]]:
"""
Check how many articles this domain publishes per day.
Content farms (fake news sites) publish 50+ articles/day.
Real news outlets publish 4–20 articles/day.
Returns:
(velocity_description: str, is_flagged: bool, details: List[str])
"""
details: List[str] = []
# Known content farms β€” hardcoded velocity flags
CONTENT_FARM_DOMAINS = {
"postcard.news", "rightlog.in", "pgurus.com",
"viral-samachar", "breakingnews-today", "india-news-today",
}
for farm in CONTENT_FARM_DOMAINS:
if farm in domain:
details.append("🚨 This domain is a known content farm β€” publishes fake news at scale")
return "Very High (50+ articles/day β€” content farm)", True, details
# Known legitimate outlets with normal velocity
NORMAL_VELOCITY_DOMAINS = {
"thehindu.com": "Normal (4–6 articles/day)",
"ndtv.com": "Normal (8–12 articles/day)",
"indianexpress.com": "Normal (6–10 articles/day)",
"hindustantimes.com": "Normal (6–10 articles/day)",
"livemint.com": "Normal (4–8 articles/day)",
"boomlive.in": "Normal (3–5 articles/day)",
"altnews.in": "Normal (2–4 articles/day)",
}
for known_domain, velocity in NORMAL_VELOCITY_DOMAINS.items():
if known_domain in domain:
details.append(f"βœ… Publication velocity is normal for established outlet")
return velocity, False, details
# Attempt to estimate velocity from RSS/sitemap
try:
rss_urls = [
f"https://{domain}/feed",
f"https://{domain}/rss",
f"https://{domain}/sitemap.xml",
]
async with aiohttp.ClientSession() as session:
for rss_url in rss_urls:
try:
async with session.get(
rss_url,
timeout=aiohttp.ClientTimeout(total=4),
) as resp:
if resp.status == 200:
content = await resp.text()
# Count <item> tags in RSS (rough article count)
item_count = content.count("<item>")
if item_count > 30:
details.append(f"🚨 RSS feed contains {item_count} items β€” unusually high volume")
return f"High ({item_count}+ items in RSS)", True, details
elif item_count > 0:
return f"Normal (~{item_count} recent articles)", False, details
except Exception:
continue
except Exception:
pass
return "Unknown", False, details
# ═══════════════════════════════════════════════════════════════════════════════
# NETWORK SCORE COMPUTATION
# ═══════════════════════════════════════════════════════════════════════════════
def _compute_network_score(
    cited_urls: List[str],
    verified: int,
    author_score: int,
    velocity_flag: bool,
    domain: str,
) -> int:
    """
    Compute the overall citation network score (0-100).

    Higher = more trustworthy citation network. Components:
      - up to +30 for the share of citations that verified,
      - -20 when the article cites nothing at all,
      - up to +25 scaled from author credibility,
      - -30 when the domain is flagged as a content farm,
      - up to +20 scaled from the domain's credibility index.

    Args:
        cited_urls: All citations extracted from the article.
        verified: How many of them verified successfully.
        author_score: Author credibility score, 0-100.
        velocity_flag: True when the domain looks like a content farm.
        domain: Root domain of the article.

    Returns:
        Score clamped to [0, 100].
    """
    score = 40  # neutral starting point

    # Citation quality: reward the verified fraction; penalise no citations.
    if cited_urls:
        score += int((verified / len(cited_urls)) * 30)
    else:
        score -= 20

    # Author contribution.
    score += int((author_score / 100) * 25)

    # Velocity penalty.
    if velocity_flag:
        score -= 30

    # Domain bonus from the credibility index. Local import presumably
    # avoids a circular import at module load time — TODO confirm.
    from core.layer3_authority import CREDIBILITY_INDEX
    # Fall back to the "www."-stripped domain, matching _analyse_author
    # (removeprefix, not lstrip, which would mangle e.g. "who.int").
    domain_cred = CREDIBILITY_INDEX.get(
        domain, CREDIBILITY_INDEX.get(domain.removeprefix("www."), 35)
    )
    score += int((domain_cred / 100) * 20)
    return max(0, min(100, score))
# ═══════════════════════════════════════════════════════════════════════════════
# STATUS DETERMINATION
# ═══════════════════════════════════════════════════════════════════════════════
def _determine_status(
total_citations: int,
verified: int,
author_score: int,
velocity_flag: bool,
network_score: int,
) -> str:
"""Determine pass / warn / fail for Layer 5."""
fail_conditions = [
total_citations == 0,
verified == 0 and total_citations > 0,
author_score < 20,
velocity_flag and network_score < 30,
]
warn_conditions = [
total_citations > 0 and verified < total_citations * 0.5,
author_score < 50,
velocity_flag,
network_score < 40,
]
if sum(fail_conditions) >= 2:
return "fail"
if sum(fail_conditions) >= 1 or sum(warn_conditions) >= 2:
return "warn"
return "pass"