| """ | |
| SatyaCheck β Layer 5: Citation Network & Author Credibility Analysis | |
| ΰ€Έΰ€€ΰ₯ΰ€― ΰ€ΰ₯ ΰ€ΰ€Ύΰ€ΰ€ | |
| Analyses the source citation graph of a news article: | |
| 1. Citation extraction β finds all sources referenced in the article | |
| 2. Source verification β checks if cited sources are real and accessible | |
| 3. Author credibility scoring β verifies author's journalism track record | |
| 4. Publication velocity detection β flags content farms (50+ articles/day) | |
| 5. Citation network score β how well-connected this article is to verified info | |
| Architecture: | |
| Input: (url, title, body_text, domain) | |
| Methods: BeautifulSoup parsing + aiohttp + heuristic citation analysis | |
| Output: Layer5Result | |
| Research basis: | |
| - Network-based fake news detection (Shu et al., 2019) | |
| - "Fake News Detection via NLP is Insufficient" β need social/network signals | |
| - FakeNewsNet graph feature analysis | |
| """ | |
import re
import logging
import asyncio
from dataclasses import dataclass, asdict
from typing import List, Tuple, Optional
from urllib.parse import urlparse

import aiohttp
from bs4 import BeautifulSoup

from core.config import settings

logger = logging.getLogger("satyacheck.layer5")

# ──────────────────────────────────────────────────────────────────────────────
# RESULT SCHEMA (local dataclass, mapped to a Pydantic model in the pipeline)
# ──────────────────────────────────────────────────────────────────────────────
@dataclass
class Layer5Result:
    status: str
    citation_count: int
    verified_citations: int
    unverified_citations: int
    author_credibility: int
    publication_velocity: str
    velocity_flag: bool
    network_score: int
    citation_details: List[str]

    def to_dict(self) -> dict:
        return asdict(self)

# ──────────────────────────────────────────────────────────────────────────────
# KNOWN HIGH-CREDIBILITY DOMAINS (trusted citation sources)
# ──────────────────────────────────────────────────────────────────────────────
TRUSTED_CITATION_DOMAINS = {
    # Government / Official
    "pib.gov.in", "mea.gov.in", "india.gov.in", "mohfw.gov.in",
    "rbi.org.in", "sebi.gov.in", "eci.gov.in",
    # News wire services
    "ptinews.com", "pti.in", "ani.in", "ians.in",
    "reuters.com", "apnews.com", "bloomberg.com",
    # Trusted Indian media
    "thehindu.com", "ndtv.com", "indianexpress.com",
    "hindustantimes.com", "livemint.com", "economictimes.com",
    "businessstandard.com", "telegraphindia.com",
    # Academic / Research
    "scholar.google.com", "researchgate.net", "pubmed.ncbi.nlm.nih.gov",
    "who.int", "un.org", "worldbank.org",
    # Fact-checkers
    "boomlive.in", "altnews.in", "factcheckindia.com",
}

# Known low-credibility / content-farm domains
KNOWN_CONTENT_FARMS = {
    "postcard.news", "rightlog.in", "organiser.org",
    "pgurus.com", "opindia.com", "swarajyamag.com",
}

# Byline extraction patterns. These are searched case-sensitively: the keyword
# may appear in either case, but the captured name must stay capitalised, so
# re.IGNORECASE would defeat the [A-Z][a-z]+ name heuristic.
AUTHOR_PATTERNS = [
    r"\b[Bb]y\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})",  # "By John Smith"
    r"\b[Aa]uthor[:\s]+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})",
    r"\b[Ww]ritten\s+[Bb]y\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})",
    r"\b[Rr]eporter[:\s]+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})",
]

# Patterns that indicate no named author (anonymity is a red flag)
ANONYMOUS_PATTERNS = [
    r"\bstaff reporter\b", r"\bweb desk\b", r"\bnews desk\b",
    r"\beditorial team\b", r"\bour correspondent\b",
    r"\bspecial correspondent\b", r"\b(ANI|PTI|IANS|Staff)\b",
]

# ──────────────────────────────────────────────────────────────────────────────
# MAIN LAYER 5 FUNCTION
# ──────────────────────────────────────────────────────────────────────────────
async def run_layer5(
    url: str,
    title: str,
    body_text: str,
    domain: str,
) -> Layer5Result:
    """
    Full Layer 5 citation network and author credibility pipeline.

    Args:
        url: Article URL
        title: Article headline
        body_text: Full article body text
        domain: Root domain of the article

    Returns:
        Layer5Result
    """
| logger.info(f"π° Layer 5: Starting citation network analysis for: {url[:60]}...") | |
| # ββ Step 1: Extract URLs cited in the article βββββββββββββββββββββββββββββ | |
| cited_urls = _extract_citations(body_text, url) | |
| # ββ Step 2: Verify cited sources (check if they're real/accessible) βββββββ | |
| verified, unverified, citation_details = await _verify_citations(cited_urls, body_text) | |
| # ββ Step 3: Author credibility analysis βββββββββββββββββββββββββββββββββββ | |
| author_score, author_details = _analyse_author(body_text, domain) | |
| citation_details.extend(author_details) | |
| # ββ Step 4: Publication velocity check ββββββββββββββββββββββββββββββββββββ | |
| velocity_str, velocity_flag, velocity_details = await _check_publication_velocity(domain) | |
| citation_details.extend(velocity_details) | |
| # ββ Step 5: Network score βββββββββββββββββββββββββββββββββββββββββββββββββ | |
| network_score = _compute_network_score( | |
| cited_urls=cited_urls, | |
| verified=verified, | |
| author_score=author_score, | |
| velocity_flag=velocity_flag, | |
| domain=domain, | |
| ) | |
| # ββ Step 6: Determine status βββββββββββββββββββββββββββββββββββββββββββββββ | |
| total_citations = len(cited_urls) | |
| status = _determine_status( | |
| total_citations, verified, author_score, velocity_flag, network_score | |
| ) | |
| logger.info( | |
| f"β Layer 5 done β status={status}, citations={total_citations}, " | |
| f"verified={verified}, author={author_score}, network={network_score}" | |
| ) | |
| return Layer5Result( | |
| status=status, | |
| citation_count=total_citations, | |
| verified_citations=verified, | |
| unverified_citations=unverified, | |
| author_credibility=author_score, | |
| publication_velocity=velocity_str, | |
| velocity_flag=velocity_flag, | |
| network_score=network_score, | |
| citation_details=citation_details[:6], # Max 6 details | |
| ) | |

# ──────────────────────────────────────────────────────────────────────────────
# CITATION EXTRACTION
# ──────────────────────────────────────────────────────────────────────────────
def _extract_citations(body_text: str, article_url: str) -> List[str]:
    """
    Extract all URLs cited in the article body.
    Also extracts implicit source references (e.g., "according to Reuters").
    """
    # Extract explicit URLs
    url_pattern = r'https?://[^\s<>"\'()]+[^\s<>"\'().,;:!?]'
    urls = re.findall(url_pattern, body_text)

    # Filter out self-references and social media links
    article_domain = urlparse(article_url).netloc
    filtered = [
        u for u in urls
        if urlparse(u).netloc != article_domain
        and "twitter.com" not in u
        and "facebook.com" not in u
        and "instagram.com" not in u
    ]

    # Add implicit mentions of trusted sources
    implicit_sources = {
        "government": "gov.in",
        "pib": "pib.gov.in",
        "who": "who.int",
        "reuters": "reuters.com",
        "associated press": "apnews.com",
        "niti aayog": "niti.gov.in",
        "rbi": "rbi.org.in",
        "supreme court": "sci.gov.in",
    }
    for keyword, source_domain in implicit_sources.items():
        if keyword.lower() in body_text.lower():
            if source_domain not in " ".join(filtered):
                filtered.append(f"https://{source_domain}/mentioned")

    return list(set(filtered))[:15]  # Cap at 15 citations to check
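
# Illustrative example (made-up article): for body text containing
#   "According to Reuters ... full release: https://pib.gov.in/PressRelease.aspx"
# published at https://example-news.in/story, this returns roughly
#   ["https://pib.gov.in/PressRelease.aspx", "https://reuters.com/mentioned"]
# (order not guaranteed): the explicit PIB link survives the self-reference and
# social-media filters, and the "reuters" keyword adds a synthetic "/mentioned"
# URL for the implicit citation.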

async def _verify_citations(
    urls: List[str],
    body_text: str,  # currently unused; reserved for contextual checks
) -> Tuple[int, int, List[str]]:
    """
    Attempt to verify that each cited URL is accessible and credible.
    Returns (verified_count, unverified_count, detail_messages).
    """
    details: List[str] = []
    if not urls:
        details.append("⚠️ No external sources or citations found in this article")
        return 0, 0, details

    verified = 0
    unverified = 0

    async def check_url(session: aiohttp.ClientSession, url: str) -> Tuple[bool, str]:
        parsed = urlparse(url)
        # str.removeprefix, not str.lstrip: lstrip("www.") would strip any
        # leading 'w'/'.' characters (e.g. "who.int" -> "ho.int")
        netloc = parsed.netloc.removeprefix("www.")

        # Check against known trusted and known unreliable domains first
        for trusted in TRUSTED_CITATION_DOMAINS:
            if trusted in netloc:
                return True, f"✅ Cites trusted source: {netloc}"
        for farm in KNOWN_CONTENT_FARMS:
            if farm in netloc:
                return False, f"🚨 Cites known unreliable source: {netloc}"

        if "/mentioned" in url:
            # Implicit mention: don't HTTP-check, just note it
            return True, f"✅ References {netloc} (mentioned in text)"

        # Try an HTTP HEAD check for accessibility
        try:
            async with session.head(
                url,
                timeout=aiohttp.ClientTimeout(total=4),
                allow_redirects=True,
            ) as resp:
                if resp.status < 400:
                    return True, f"✅ Source accessible: {netloc}"
                return False, f"⚠️ Source returned error ({resp.status}): {netloc}"
        except Exception:
            return False, f"⚠️ Could not verify source: {netloc}"

    # Check up to 8 URLs concurrently, sharing one client session
    async with aiohttp.ClientSession() as session:
        tasks = [check_url(session, url) for url in urls[:8]]
        results = await asyncio.gather(*tasks, return_exceptions=True)

    for result in results:
        if isinstance(result, Exception):
            unverified += 1
            continue
        ok, message = result
        if ok:
            verified += 1
        else:
            unverified += 1
        if len(details) < 4:
            details.append(message)

    return verified, unverified, details

# ──────────────────────────────────────────────────────────────────────────────
# AUTHOR CREDIBILITY
# ──────────────────────────────────────────────────────────────────────────────
def _analyse_author(body_text: str, domain: str) -> Tuple[int, List[str]]:
    """
    Analyse the author's credibility based on:
    - Named vs anonymous byline
    - Domain credibility
    - Source of the article

    Returns:
        (author_credibility_score: int [0-100], detail_messages: List[str])
    """
    details: List[str] = []
    score = 50  # Start neutral

    # Check whether the author is named. Searched case-sensitively: the
    # patterns rely on capitalised names (see AUTHOR_PATTERNS above).
    author_found = False
    for pattern in AUTHOR_PATTERNS:
        match = re.search(pattern, body_text[:500])
        if match:
            author_name = match.group(1).strip()
            author_found = True
            score += 20
            details.append(f"✅ Named author found: {author_name}")
            break

    if not author_found:
        # Check for anonymous-byline patterns
        for pattern in ANONYMOUS_PATTERNS:
            if re.search(pattern, body_text[:500], re.IGNORECASE):
                score -= 15
                details.append(
                    "⚠️ Article attributed to 'desk' or unnamed staff; "
                    "no individual author is accountable"
                )
                break
        else:
            details.append("⚠️ No author byline detected")
            score -= 10

    # Domain credibility contributes to author credibility
    # (imported here to avoid a circular import at module load time)
    from core.layer3_authority import CREDIBILITY_INDEX

    domain_score = CREDIBILITY_INDEX.get(
        domain, CREDIBILITY_INDEX.get(domain.removeprefix("www."), 35)
    )
    if domain_score >= 80:
        score += 25
        details.append(f"✅ Published in high-credibility outlet (score: {domain_score}/100)")
    elif domain_score >= 50:
        score += 10
    elif domain_score < 30:
        score -= 20
        details.append(f"🚨 Published in low-credibility outlet (score: {domain_score}/100)")

    return max(0, min(100, score)), details
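
# Worked example (illustrative): a named byline (+20) in an outlet scored 85
# on the credibility index (+25), starting from the neutral 50, gives
#   min(100, 50 + 20 + 25) = 95.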

# ──────────────────────────────────────────────────────────────────────────────
# PUBLICATION VELOCITY CHECK
# ──────────────────────────────────────────────────────────────────────────────
async def _check_publication_velocity(domain: str) -> Tuple[str, bool, List[str]]:
    """
    Check how many articles this domain publishes per day.
    Content farms (fake news sites) publish 50+ articles/day;
    real news outlets typically publish 4–20 articles/day.

    Returns:
        (velocity_description: str, is_flagged: bool, details: List[str])
    """
    details: List[str] = []

    # Known content farms: hardcoded velocity flags
    CONTENT_FARM_DOMAINS = {
        "postcard.news", "rightlog.in", "pgurus.com",
        "viral-samachar", "breakingnews-today", "india-news-today",
    }
    for farm in CONTENT_FARM_DOMAINS:
        if farm in domain:
            details.append("🚨 This domain is a known content farm that publishes fake news at scale")
            return "Very High (50+ articles/day, content farm)", True, details

    # Known legitimate outlets with normal velocity
    NORMAL_VELOCITY_DOMAINS = {
        "thehindu.com": "Normal (4–6 articles/day)",
        "ndtv.com": "Normal (8–12 articles/day)",
        "indianexpress.com": "Normal (6–10 articles/day)",
        "hindustantimes.com": "Normal (6–10 articles/day)",
        "livemint.com": "Normal (4–8 articles/day)",
        "boomlive.in": "Normal (3–5 articles/day)",
        "altnews.in": "Normal (2–4 articles/day)",
    }
    for known_domain, velocity in NORMAL_VELOCITY_DOMAINS.items():
        if known_domain in domain:
            details.append("✅ Publication velocity is normal for an established outlet")
            return velocity, False, details

    # Attempt to estimate velocity from RSS/sitemap
    try:
        rss_urls = [
            f"https://{domain}/feed",
            f"https://{domain}/rss",
            f"https://{domain}/sitemap.xml",
        ]
        async with aiohttp.ClientSession() as session:
            for rss_url in rss_urls:
                try:
                    async with session.get(
                        rss_url,
                        timeout=aiohttp.ClientTimeout(total=4),
                    ) as resp:
                        if resp.status == 200:
                            content = await resp.text()
                            # Count <item> tags in the RSS feed (rough article count)
                            item_count = content.count("<item>")
                            if item_count > 30:
                                details.append(f"🚨 RSS feed contains {item_count} items, an unusually high volume")
                                return f"High ({item_count}+ items in RSS)", True, details
                            elif item_count > 0:
                                return f"Normal (~{item_count} recent articles)", False, details
                except Exception:
                    continue
    except Exception:
        pass

    return "Unknown", False, details

# ──────────────────────────────────────────────────────────────────────────────
# NETWORK SCORE COMPUTATION
# ──────────────────────────────────────────────────────────────────────────────
def _compute_network_score(
    cited_urls: List[str],
    verified: int,
    author_score: int,
    velocity_flag: bool,
    domain: str,
) -> int:
    """
    Compute the overall citation network score (0–100).
    Higher = more trustworthy citation network.
    """
    score = 40  # Neutral start

    # Citation quality
    if cited_urls:
        verified_ratio = verified / len(cited_urls)
        score += int(verified_ratio * 30)
    else:
        score -= 20

    # Author contribution
    score += int((author_score / 100) * 25)

    # Velocity penalty
    if velocity_flag:
        score -= 30

    # Domain bonus from the credibility index
    # (imported here to avoid a circular import at module load time)
    from core.layer3_authority import CREDIBILITY_INDEX

    domain_cred = CREDIBILITY_INDEX.get(domain, 35)
    score += int((domain_cred / 100) * 20)

    return max(0, min(100, score))
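
# Worked example (illustrative): 5 citations with 3 verified, author score 70,
# no velocity flag, and the default domain credibility of 35:
#   40 + int(3/5 * 30) + int(70/100 * 25) + int(35/100 * 20)
#     = 40 + 18 + 17 + 7 = 82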

# ──────────────────────────────────────────────────────────────────────────────
# STATUS DETERMINATION
# ──────────────────────────────────────────────────────────────────────────────
def _determine_status(
    total_citations: int,
    verified: int,
    author_score: int,
    velocity_flag: bool,
    network_score: int,
) -> str:
    """Determine pass / warn / fail for Layer 5."""
    fail_conditions = [
        total_citations == 0,
        verified == 0 and total_citations > 0,
        author_score < 20,
        velocity_flag and network_score < 30,
    ]
    warn_conditions = [
        total_citations > 0 and verified < total_citations * 0.5,
        author_score < 50,
        velocity_flag,
        network_score < 40,
    ]
    if sum(fail_conditions) >= 2:
        return "fail"
    if sum(fail_conditions) >= 1 or sum(warn_conditions) >= 2:
        return "warn"
    return "pass"