# satyacheck-backend / core / layer5_citation_network.py
# omiii2005 -- "Initial clean deploy", commit 87eb9ac
"""
SatyaCheck β€” Layer 5: Citation Network & Author Credibility Analysis
ΰ€Έΰ€€ΰ₯ΰ€― ΰ€•ΰ₯€ ΰ€œΰ€Ύΰ€ΰ€š
Analyses the source citation graph of a news article:
1. Citation extraction β€” finds all sources referenced in the article
2. Source verification β€” checks if cited sources are real and accessible
3. Author credibility scoring β€” verifies author's journalism track record
4. Publication velocity detection β€” flags content farms (50+ articles/day)
5. Citation network score β€” how well-connected this article is to verified info
Architecture:
Input: (url, title, body_text, domain)
Methods: BeautifulSoup parsing + aiohttp + heuristic citation analysis
Output: Layer5Result
Research basis:
- Network-based fake news detection (Shu et al., 2019)
- "Fake News Detection via NLP is Insufficient" β€” need social/network signals
- FakeNewsNet graph feature analysis
"""
import asyncio
import logging
import re
from dataclasses import asdict, dataclass
from typing import List, Optional, Tuple
from urllib.parse import urlparse

import aiohttp
from bs4 import BeautifulSoup

from core.config import settings
logger = logging.getLogger("satyacheck.layer5")
# ═══════════════════════════════════════════════════════════════════════════════
# RESULT SCHEMA (local dataclass β€” mapped to Pydantic in pipeline)
# ═══════════════════════════════════════════════════════════════════════════════
@dataclass
class Layer5Result:
    """Result container for Layer 5 (citation network & author credibility).

    Kept as a plain dataclass here; mapped to a Pydantic model later in
    the pipeline (see module docstring).
    """

    status: str                  # "pass" | "warn" | "fail" (see _determine_status)
    citation_count: int          # total citations found in the article
    verified_citations: int      # citations confirmed trusted / accessible
    unverified_citations: int    # citations that failed verification
    author_credibility: int      # 0-100 author credibility score
    publication_velocity: str    # human-readable velocity description
    velocity_flag: bool          # True when the domain looks like a content farm
    network_score: int           # 0-100 overall citation-network score
    citation_details: List[str]  # human-readable detail messages (max 6)

    def to_dict(self) -> dict:
        """Return a JSON-serialisable dict view of this result.

        Note: ``asdict`` copies container fields, so mutating the returned
        dict does not affect this instance.
        """
        return asdict(self)
# ═══════════════════════════════════════════════════════════════════════════════
# KNOWN HIGH-CREDIBILITY DOMAINS (trusted citation sources)
# ═══════════════════════════════════════════════════════════════════════════════
# Domains treated as inherently credible when cited by an article.
# Matched by substring against the citation's netloc in _verify_citations,
# so entries should be distinctive domain suffixes.
TRUSTED_CITATION_DOMAINS = {
    # Government / Official
    "pib.gov.in", "mea.gov.in", "india.gov.in", "mohfw.gov.in",
    "rbi.org.in", "sebi.gov.in", "eci.gov.in",
    # News Wire Services
    "ptinews.com", "pti.in", "ani.in", "ians.in",
    "reuters.com", "apnews.com", "bloomberg.com",
    # Trusted Indian media
    "thehindu.com", "ndtv.com", "indianexpress.com",
    "hindustantimes.com", "livemint.com", "economictimes.com",
    "businessstandard.com", "telegraphindia.com",
    # Academic / Research
    "scholar.google.com", "researchgate.net", "pubmed.ncbi.nlm.nih.gov",
    "who.int", "un.org", "worldbank.org",
    # Fact-checkers
    "boomlive.in", "altnews.in", "factcheckindia.com",
}
# Known low-credibility / content-farm domains — citing any of these
# counts against the article in _verify_citations.
KNOWN_CONTENT_FARMS = {
    "postcard.news", "rightlog.in", "organiser.org",
    "pgurus.com", "opindia.com", "swarajyamag.com",
}
# Byline extraction patterns, applied to the first 500 chars of the body.
# NOTE(review): _analyse_author searches these with re.IGNORECASE, which
# defeats the [A-Z][a-z] capitalisation classes — confirm intent.
AUTHOR_PATTERNS = [
    r"by\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})",  # "By John Smith"
    r"author[:\s]+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})",
    r"written\s+by\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})",
    r"reporter[:\s]+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})",
]
# Patterns that indicate no named author (anonymity = red flag)
ANONYMOUS_PATTERNS = [
    r"\bstaff reporter\b", r"\bweb desk\b", r"\bnews desk\b",
    r"\beditorial team\b", r"\bour correspondent\b",
    r"\bspecial correspondent\b", r"\b(ANI|PTI|IANS|Staff)\b",
]
# ═══════════════════════════════════════════════════════════════════════════════
# MAIN LAYER 5 FUNCTION
# ═══════════════════════════════════════════════════════════════════════════════
async def run_layer5(
    url: str,
    title: str,
    body_text: str,
    domain: str,
) -> Layer5Result:
    """
    Run the full Layer 5 citation-network and author-credibility pipeline.

    Args:
        url: Article URL.
        title: Article headline (not used by this layer's current steps).
        body_text: Full article body text.
        domain: Root domain of the article.

    Returns:
        Layer5Result with counts, scores, and up to 6 detail messages.
    """
    logger.info(f"πŸ“° Layer 5: Starting citation network analysis for: {url[:60]}...")

    # 1. Pull every cited URL (explicit links + implicit source mentions).
    citations = _extract_citations(body_text, url)

    # 2. Verify cited sources; 3. score the author; 4. check publication
    #    velocity — each step contributes human-readable detail messages.
    verified, unverified, details = await _verify_citations(citations, body_text)

    author_score, extra = _analyse_author(body_text, domain)
    details += extra

    velocity_desc, flagged, extra = await _check_publication_velocity(domain)
    details += extra

    # 5. Fold everything into a single 0-100 network score.
    net_score = _compute_network_score(
        cited_urls=citations,
        verified=verified,
        author_score=author_score,
        velocity_flag=flagged,
        domain=domain,
    )

    # 6. Map the raw numbers onto pass / warn / fail.
    n_citations = len(citations)
    status = _determine_status(n_citations, verified, author_score, flagged, net_score)

    logger.info(
        f"βœ… Layer 5 done β€” status={status}, citations={n_citations}, "
        f"verified={verified}, author={author_score}, network={net_score}"
    )
    return Layer5Result(
        status=status,
        citation_count=n_citations,
        verified_citations=verified,
        unverified_citations=unverified,
        author_credibility=author_score,
        publication_velocity=velocity_desc,
        velocity_flag=flagged,
        network_score=net_score,
        citation_details=details[:6],  # cap at 6 detail messages
    )
# ═══════════════════════════════════════════════════════════════════════════════
# CITATION EXTRACTION
# ═══════════════════════════════════════════════════════════════════════════════
def _extract_citations(body_text: str, article_url: str) -> List[str]:
"""
Extract all URLs cited in the article body.
Also extracts implicit source references (e.g., 'according to Reuters').
"""
# Extract explicit URLs
url_pattern = r'https?://[^\s<>"\'()]+[^\s<>"\'().,;:!?]'
urls = re.findall(url_pattern, body_text)
# Filter out self-references and social media
article_domain = urlparse(article_url).netloc
filtered = [
u for u in urls
if urlparse(u).netloc != article_domain
and "twitter.com" not in u
and "facebook.com" not in u
and "instagram.com" not in u
]
# Add implicit mentions of trusted sources
implicit_sources = {
"government": "gov.in",
"pib": "pib.gov.in",
"who": "who.int",
"reuters": "reuters.com",
"associated press": "apnews.com",
"niti aayog": "niti.gov.in",
"rbi": "rbi.org.in",
"supreme court": "sci.gov.in",
}
for keyword, source_domain in implicit_sources.items():
if keyword.lower() in body_text.lower():
if source_domain not in " ".join(filtered):
filtered.append(f"https://{source_domain}/mentioned")
return list(set(filtered))[:15] # Cap at 15 citations to check
async def _verify_citations(
    urls: List[str],
    body_text: str,
) -> Tuple[int, int, List[str]]:
    """
    Attempt to verify each cited URL is accessible and credible.

    Each URL is first matched against the trusted / content-farm domain
    lists; anything unknown gets a lightweight HTTP HEAD accessibility
    check (4 s timeout). At most the first 8 URLs are checked, and at
    most 4 detail messages are kept.

    Args:
        urls: Cited URLs (may include synthetic ".../mentioned" entries).
        body_text: Article body (currently unused; kept for interface
            stability).

    Returns:
        (verified_count, unverified_count, detail_messages)
    """
    details: List[str] = []
    if not urls:
        details.append("⚠️ No external sources or citations found in this article")
        return 0, 0, details

    verified = 0
    unverified = 0

    async def check_url(session: aiohttp.ClientSession, url: str) -> Tuple[bool, str]:
        parsed = urlparse(url)
        # removeprefix, not lstrip: lstrip("www.") strips *characters* in
        # the set {w, .} and would mangle hosts like "who.int" -> "ho.int".
        netloc = parsed.netloc.removeprefix("www.")
        # Check against known trusted domains first — no HTTP needed.
        for trusted in TRUSTED_CITATION_DOMAINS:
            if trusted in netloc:
                return True, f"βœ… Cites trusted source: {netloc}"
        for farm in KNOWN_CONTENT_FARMS:
            if farm in netloc:
                return False, f"🚨 Cites known unreliable source: {netloc}"
        # Try HTTP HEAD check for accessibility.
        try:
            if "/mentioned" in url:
                # Implicit mention β€” don't HTTP check, just note
                return True, f"βœ… References {netloc} (mentioned in text)"
            async with session.head(
                url,
                timeout=aiohttp.ClientTimeout(total=4),
                allow_redirects=True,
            ) as resp:
                if resp.status < 400:
                    return True, f"βœ… Source accessible: {netloc}"
                return False, f"⚠️ Source returned error ({resp.status}): {netloc}"
        except Exception:
            return False, f"⚠️ Could not verify source: {netloc}"

    # One shared session for all checks (the original opened a new
    # ClientSession per URL, defeating connection reuse).
    async with aiohttp.ClientSession() as session:
        tasks = [check_url(session, url) for url in urls[:8]]  # max 8 checks
        results = await asyncio.gather(*tasks, return_exceptions=True)

    for result in results:
        if isinstance(result, Exception):
            # gather() surfaced an unexpected error; count as unverified.
            unverified += 1
            continue
        ok, message = result
        if ok:
            verified += 1
        else:
            unverified += 1
        if len(details) < 4:
            details.append(message)
    return verified, unverified, details
# ═══════════════════════════════════════════════════════════════════════════════
# AUTHOR CREDIBILITY
# ═══════════════════════════════════════════════════════════════════════════════
def _analyse_author(body_text: str, domain: str) -> Tuple[int, List[str]]:
    """
    Analyse the author's credibility.

    Signals (all searched in the first 500 chars of the body):
      - Named byline (+20) vs anonymous desk attribution (-15) vs no
        byline at all (-10)
      - Credibility of the publishing domain (+25 / +10 / -20 bands)

    Args:
        body_text: Full article body text.
        domain: Root domain of the article.

    Returns:
        (author_credibility_score: int [0-100], detail_messages: List[str])
    """
    details: List[str] = []
    score = 50  # start neutral

    # Look for a named byline. NOTE(review): re.IGNORECASE defeats the
    # [A-Z][a-z] capitalisation classes in AUTHOR_PATTERNS, so this can
    # also capture uncapitalised phrases after "by" — confirm intent.
    author_found = False
    for pattern in AUTHOR_PATTERNS:
        match = re.search(pattern, body_text[:500], re.IGNORECASE)
        if match:
            author_name = match.group(1).strip()
            author_found = True
            score += 20
            details.append(f"βœ… Named author found: {author_name}")
            break
    if not author_found:
        # No named author: distinguish deliberate desk attribution from
        # a missing byline altogether.
        for pattern in ANONYMOUS_PATTERNS:
            if re.search(pattern, body_text[:500], re.IGNORECASE):
                score -= 15
                details.append("⚠️ Article attributed to 'desk' or unnamed staff β€” no individual author accountable")
                break
        else:
            details.append("⚠️ No author byline detected")
            score -= 10

    # Domain contributes to author credibility. Local import presumably
    # avoids a circular import at module load time — TODO confirm.
    from core.layer3_authority import CREDIBILITY_INDEX
    # removeprefix, not lstrip: lstrip("www.") strips characters and would
    # mangle domains such as "who.int" -> "ho.int".
    domain_score = CREDIBILITY_INDEX.get(
        domain, CREDIBILITY_INDEX.get(domain.removeprefix("www."), 35)
    )
    if domain_score >= 80:
        score += 25
        details.append(f"βœ… Published in high-credibility outlet (score: {domain_score}/100)")
    elif domain_score >= 50:
        score += 10
    elif domain_score < 30:
        score -= 20
        details.append(f"🚨 Published in low-credibility outlet (score: {domain_score}/100)")
    return max(0, min(100, score)), details
# ═══════════════════════════════════════════════════════════════════════════════
# PUBLICATION VELOCITY CHECK
# ═══════════════════════════════════════════════════════════════════════════════
async def _check_publication_velocity(domain: str) -> Tuple[str, bool, List[str]]:
"""
Check how many articles this domain publishes per day.
Content farms (fake news sites) publish 50+ articles/day.
Real news outlets publish 4–20 articles/day.
Returns:
(velocity_description: str, is_flagged: bool, details: List[str])
"""
details: List[str] = []
# Known content farms β€” hardcoded velocity flags
CONTENT_FARM_DOMAINS = {
"postcard.news", "rightlog.in", "pgurus.com",
"viral-samachar", "breakingnews-today", "india-news-today",
}
for farm in CONTENT_FARM_DOMAINS:
if farm in domain:
details.append("🚨 This domain is a known content farm β€” publishes fake news at scale")
return "Very High (50+ articles/day β€” content farm)", True, details
# Known legitimate outlets with normal velocity
NORMAL_VELOCITY_DOMAINS = {
"thehindu.com": "Normal (4–6 articles/day)",
"ndtv.com": "Normal (8–12 articles/day)",
"indianexpress.com": "Normal (6–10 articles/day)",
"hindustantimes.com": "Normal (6–10 articles/day)",
"livemint.com": "Normal (4–8 articles/day)",
"boomlive.in": "Normal (3–5 articles/day)",
"altnews.in": "Normal (2–4 articles/day)",
}
for known_domain, velocity in NORMAL_VELOCITY_DOMAINS.items():
if known_domain in domain:
details.append(f"βœ… Publication velocity is normal for established outlet")
return velocity, False, details
# Attempt to estimate velocity from RSS/sitemap
try:
rss_urls = [
f"https://{domain}/feed",
f"https://{domain}/rss",
f"https://{domain}/sitemap.xml",
]
async with aiohttp.ClientSession() as session:
for rss_url in rss_urls:
try:
async with session.get(
rss_url,
timeout=aiohttp.ClientTimeout(total=4),
) as resp:
if resp.status == 200:
content = await resp.text()
# Count <item> tags in RSS (rough article count)
item_count = content.count("<item>")
if item_count > 30:
details.append(f"🚨 RSS feed contains {item_count} items β€” unusually high volume")
return f"High ({item_count}+ items in RSS)", True, details
elif item_count > 0:
return f"Normal (~{item_count} recent articles)", False, details
except Exception:
continue
except Exception:
pass
return "Unknown", False, details
# ═══════════════════════════════════════════════════════════════════════════════
# NETWORK SCORE COMPUTATION
# ═══════════════════════════════════════════════════════════════════════════════
def _compute_network_score(
    cited_urls: List[str],
    verified: int,
    author_score: int,
    velocity_flag: bool,
    domain: str,
) -> int:
    """
    Compute the overall citation network score (0-100).

    Higher = more trustworthy citation network. Components:
      - up to +30 for the share of citations that verified,
      - -20 when the article cites nothing at all,
      - up to +25 scaled from author credibility,
      - -30 when the domain is flagged as a content farm,
      - up to +20 scaled from the domain's credibility index.

    Args:
        cited_urls: All citations extracted from the article.
        verified: How many of them verified successfully.
        author_score: Author credibility score, 0-100.
        velocity_flag: True when the domain looks like a content farm.
        domain: Root domain of the article.

    Returns:
        Score clamped to [0, 100].
    """
    score = 40  # neutral starting point

    # Citation quality: reward the verified fraction; penalise no citations.
    if cited_urls:
        score += int((verified / len(cited_urls)) * 30)
    else:
        score -= 20

    # Author contribution.
    score += int((author_score / 100) * 25)

    # Velocity penalty.
    if velocity_flag:
        score -= 30

    # Domain bonus from the credibility index. Local import presumably
    # avoids a circular import at module load time — TODO confirm.
    from core.layer3_authority import CREDIBILITY_INDEX
    # Fall back to the "www."-stripped domain, matching _analyse_author
    # (removeprefix, not lstrip, which would mangle e.g. "who.int").
    domain_cred = CREDIBILITY_INDEX.get(
        domain, CREDIBILITY_INDEX.get(domain.removeprefix("www."), 35)
    )
    score += int((domain_cred / 100) * 20)
    return max(0, min(100, score))
# ═══════════════════════════════════════════════════════════════════════════════
# STATUS DETERMINATION
# ═══════════════════════════════════════════════════════════════════════════════
def _determine_status(
total_citations: int,
verified: int,
author_score: int,
velocity_flag: bool,
network_score: int,
) -> str:
"""Determine pass / warn / fail for Layer 5."""
fail_conditions = [
total_citations == 0,
verified == 0 and total_citations > 0,
author_score < 20,
velocity_flag and network_score < 30,
]
warn_conditions = [
total_citations > 0 and verified < total_citations * 0.5,
author_score < 50,
velocity_flag,
network_score < 40,
]
if sum(fail_conditions) >= 2:
return "fail"
if sum(fail_conditions) >= 1 or sum(warn_conditions) >= 2:
return "warn"
return "pass"