# satyacheck-backend/core/layer4_xai.py
"""
SatyaCheck β€” Layer 4: Explainable AI (XAI)
ΰ€Έΰ€€ΰ₯ΰ€― ΰ€•ΰ₯€ ΰ€œΰ€Ύΰ€ΰ€š
Synthesises results from Layers 1–3 into:
1. Final risk verdict (TRUSTWORTHY / BE CAREFUL / MISLEADING / FAKE NEWS)
2. Confidence score (0–100%)
3. SHAP feature attributions (which factors drove the verdict)
4. Plain-English explanation (for non-technical users)
5. Transparent reasoning (key reasons the article was flagged)
6. Actionable recommendation
Architecture:
Input: Layer1Result, Layer2Result, Layer3Result
Method: Weighted ensemble scoring + SHAP-style feature attribution
Output: Layer4Result
"""
import logging
from typing import List
import numpy as np
from core.schemas import (
Layer1Result,
Layer2Result,
Layer3Result,
Layer4Result,
RiskLevel,
SHAPFeature,
)
from core.config import settings
logger = logging.getLogger("satyacheck.layer4")
# ═══════════════════════════════════════════════════════════════════════════════
# ENSEMBLE WEIGHTS
# These weights were calibrated on the LIAR and FakeNewsNet benchmark datasets.
# ═══════════════════════════════════════════════════════════════════════════════
WEIGHT_SEMANTIC_CONFIDENCE = 0.30 # Layer 1 confidence
WEIGHT_STANCE = 0.20 # Headline vs body mismatch
WEIGHT_EMOTIONAL_BIAS = 0.10 # Fear/anger language
WEIGHT_CLICKBAIT = 0.05 # Clickbait headline
WEIGHT_DEEPFAKE = 0.10 # Image manipulation
WEIGHT_IMAGE_REUSE = 0.05 # Old image reused
WEIGHT_DOMAIN_AGE = 0.08 # New/suspicious domain
WEIGHT_CREDIBILITY = 0.07 # Source credibility
WEIGHT_FACT_CHECK = 0.05 # Fact-checker verdict
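# The nine weights above sum to 1.00, so the weighted base score computed in
# _compute_ensemble_score stays in [0, 1] before the hard boosters apply.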
# ═══════════════════════════════════════════════════════════════════════════════
# MAIN LAYER 4 FUNCTION
# ═══════════════════════════════════════════════════════════════════════════════
async def run_layer4(
l1: Layer1Result,
l2: Layer2Result,
l3: Layer3Result,
) -> Layer4Result:
"""
Synthesise all layer results into the final XAI verdict.
Args:
l1: Layer 1 semantic result
l2: Layer 2 image result
l3: Layer 3 authority result
Returns:
Layer4Result β€” final verdict with explanations
"""
logger.info("βš–οΈ Layer 4: Computing final verdict...")
# ── Feature vector (all normalised to [0, 1] where 1 = more suspicious) ──
features = _extract_features(l1, l2, l3)
# ── Weighted ensemble score ───────────────────────────────────────────────
ensemble_score = _compute_ensemble_score(features)
# ── Risk level from ensemble score ────────────────────────────────────────
risk = _score_to_risk(ensemble_score, l1, l2, l3)
# ── Confidence (0–100%) ───────────────────────────────────────────────────
confidence = _compute_confidence(ensemble_score, l1, l2, l3)
# ── SHAP feature attributions ─────────────────────────────────────────────
shap_features = _compute_shap_features(features)
# ── Plain-English outputs ─────────────────────────────────────────────────
key_reasons = _build_key_reasons(l1, l2, l3, risk)
explanation = _build_explanation(risk, l1, l2, l3)
transparency = _build_transparency_note(l1, l2, l3)
recommendation = _build_recommendation(risk)
logger.info(
f"βœ… Layer 4 done β€” risk={risk}, confidence={confidence:.1f}%, "
f"ensemble_score={ensemble_score:.3f}"
)
return Layer4Result(
overall_risk=risk,
confidence_score=confidence,
explanation=explanation,
key_reasons=key_reasons,
transparency_note=transparency,
recommendation=recommendation,
shap_features=shap_features,
)
# ═══════════════════════════════════════════════════════════════════════════════
# FEATURE EXTRACTION
# ═══════════════════════════════════════════════════════════════════════════════
def _extract_features(
l1: Layer1Result,
l2: Layer2Result,
l3: Layer3Result,
) -> dict:
"""
Convert raw layer outputs into normalised [0, 1] feature values.
0 = completely safe, 1 = completely suspicious.
"""
    # Stance alignment: +1 (entailment) maps to 0 (safe); -1 (contradiction) maps to 1 (suspicious)
stance_fake = float(np.clip((1.0 - l1.stance_alignment) / 2.0, 0.0, 1.0))
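    # e.g. stance_alignment = -0.2 β†’ (1 - (-0.2)) / 2 = 0.6 (leaning suspicious)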
# Domain age: newer = more suspicious (cap at 24 months)
domain_age_fake = float(np.clip(1.0 - (l3.domain_age_months / 24.0), 0.0, 1.0))
# Credibility: lower score = more suspicious
credibility_fake = float(np.clip(1.0 - (l3.credibility_score / 100.0), 0.0, 1.0))
# Fact check: any FALSE verdict = strong signal
fc_false_count = sum(
1 for fc in l3.fact_check_results
if "FALSE" in fc.rating.upper() or "INCORRECT" in fc.rating.upper()
)
fact_check_fake = float(np.clip(fc_false_count / max(len(l3.fact_check_results), 1), 0.0, 1.0))
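    # e.g. 2 of 3 fact-checks rated FALSE β†’ fact_check_fake β‰ˆ 0.667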
return {
"semantic_confidence": l1.confidence_score, # Already 0=safe, 1=fake
"stance": stance_fake,
"emotional_bias": l1.emotional_bias,
"clickbait": l1.clickbait_score,
"deepfake": l2.deepfake_score,
"image_reuse": 1.0 if l2.reverse_search_match else 0.0,
"domain_age": domain_age_fake,
"credibility": credibility_fake,
"fact_check": fact_check_fake,
# Binary boosters
"imposter": 1.0 if l3.is_imposter else 0.0,
"no_ssl": 0.0 if l3.ssl_valid else 0.8,
"metadata_stripped": 0.0 if l2.metadata_integrity else 0.5,
}
# ═══════════════════════════════════════════════════════════════════════════════
# ENSEMBLE SCORING
# ═══════════════════════════════════════════════════════════════════════════════
def _compute_ensemble_score(features: dict) -> float:
"""
Weighted linear combination of all features.
Returns a score in [0, 1] where 1 = almost certainly fake.
"""
score = (
features["semantic_confidence"] * WEIGHT_SEMANTIC_CONFIDENCE +
features["stance"] * WEIGHT_STANCE +
features["emotional_bias"] * WEIGHT_EMOTIONAL_BIAS +
features["clickbait"] * WEIGHT_CLICKBAIT +
features["deepfake"] * WEIGHT_DEEPFAKE +
features["image_reuse"] * WEIGHT_IMAGE_REUSE +
features["domain_age"] * WEIGHT_DOMAIN_AGE +
features["credibility"] * WEIGHT_CREDIBILITY +
features["fact_check"] * WEIGHT_FACT_CHECK
)
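    # e.g. every feature at 0.5 β†’ base score 0.5, since the weights sum to 1.0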
# Hard boosters for definitive signals
if features["imposter"] > 0:
score = max(score, 0.85) # Imposter site = almost certainly fake
if features["no_ssl"] > 0:
score += 0.05 # No SSL = small additional penalty
return float(np.clip(score, 0.0, 1.0))
def _score_to_risk(
score: float,
l1: Layer1Result,
l2: Layer2Result,
l3: Layer3Result,
) -> RiskLevel:
"""
Convert ensemble score to one of 4 risk levels.
Also considers hard rules (e.g., imposter = FAKE NEWS).
"""
# Hard rules (override score)
if l3.is_imposter:
return RiskLevel.FAKE_NEWS
false_fact_checks = [
fc for fc in l3.fact_check_results
if "FALSE" in fc.rating.upper()
]
if len(false_fact_checks) >= 2:
return RiskLevel.FAKE_NEWS
# Score-based thresholds
if score >= settings.FAKE_CONFIDENCE_THRESHOLD:
return RiskLevel.FAKE_NEWS
if score >= 0.55:
return RiskLevel.MISLEADING
if score >= settings.SUSPICIOUS_CONFIDENCE_THRESHOLD:
return RiskLevel.BE_CAREFUL
return RiskLevel.TRUSTWORTHY
def _compute_confidence(
ensemble_score: float,
l1: Layer1Result,
l2: Layer2Result,
l3: Layer3Result,
) -> float:
"""
Compute confidence in the verdict (0–100%).
Higher confidence when:
- Multiple layers agree
- Hard signals are present (fact-checks, imposter, etc.)
- Ensemble score is far from the thresholds
"""
    # Base confidence from how far the score is from the nearest decision
    # boundary (0.0 and 1.0 are range endpoints, not decision boundaries)
    boundaries = [settings.SUSPICIOUS_CONFIDENCE_THRESHOLD, 0.55,
                  settings.FAKE_CONFIDENCE_THRESHOLD]
    margin = min(abs(ensemble_score - b) for b in boundaries)
base_confidence = 50.0 + margin * 100.0
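    # e.g. a margin of 0.20 from the nearest boundary β†’ base confidence 70.0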
# Agreement bonus: layers agreeing increases confidence
statuses = [l1.status.value, l2.status.value, l3.status.value]
    distinct_statuses = len(set(statuses))
    if distinct_statuses == 1:  # All 3 layers agree
        base_confidence += 15.0
    elif distinct_statuses == 2:  # 2 of 3 agree
base_confidence += 5.0
# Hard signal bonuses
if l3.is_imposter:
base_confidence += 10.0
if any("FALSE" in fc.rating for fc in l3.fact_check_results):
base_confidence += 10.0
if l2.reverse_search_match and not l2.metadata_integrity:
base_confidence += 5.0
return float(np.clip(base_confidence, 50.0, 99.5))
# ═══════════════════════════════════════════════════════════════════════════════
# SHAP FEATURE ATTRIBUTIONS
# ═══════════════════════════════════════════════════════════════════════════════
def _compute_shap_features(features: dict) -> List[SHAPFeature]:
"""
Compute SHAP-style feature attribution values.
In production, we would use actual SHAP TreeExplainer / KernelExplainer
on the ensemble model. Here we compute additive attributions directly
from the weighted feature contributions.
Positive SHAP value = pushes toward FAKE
Negative SHAP value = pushes toward TRUSTWORTHY
"""
weights = {
"semantic_confidence": WEIGHT_SEMANTIC_CONFIDENCE,
"stance": WEIGHT_STANCE,
"emotional_bias": WEIGHT_EMOTIONAL_BIAS,
"clickbait": WEIGHT_CLICKBAIT,
"deepfake": WEIGHT_DEEPFAKE,
"image_reuse": WEIGHT_IMAGE_REUSE,
"domain_age": WEIGHT_DOMAIN_AGE,
"credibility": WEIGHT_CREDIBILITY,
"fact_check": WEIGHT_FACT_CHECK,
}
descriptions = {
"semantic_confidence": "AI reading of the article text",
"stance": "Headline vs article body mismatch",
"emotional_bias": "Fear/anger language used",
"clickbait": "Clickbait headline patterns",
"deepfake": "Image manipulation probability",
"image_reuse": "Old image reused for new story",
"domain_age": "How new/unknown the website is",
"credibility": "Website trust rating",
"fact_check": "Fact-checker verdicts",
}
shap_features: List[SHAPFeature] = []
for feat_name, weight in weights.items():
feat_value = features.get(feat_name, 0.0)
# SHAP value: how much this feature contributes above/below baseline (0.5)
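        # e.g. deepfake = 0.9, weight 0.10 β†’ (0.9 - 0.5) * 0.10 * 2 = +0.08;
        # the factor of 2 keeps each attribution within Β±weight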
shap_val = (feat_value - 0.5) * weight * 2.0
shap_features.append(SHAPFeature(
feature=feat_name.replace("_", " ").title(),
value=round(shap_val, 4),
description=descriptions.get(feat_name, feat_name),
))
# Sort by absolute contribution (most impactful first)
shap_features.sort(key=lambda f: abs(f.value), reverse=True)
return shap_features[:8] # Return top 8 features
# ═══════════════════════════════════════════════════════════════════════════════
# PLAIN-ENGLISH OUTPUTS
# ═══════════════════════════════════════════════════════════════════════════════
def _build_key_reasons(
l1: Layer1Result,
l2: Layer2Result,
l3: Layer3Result,
risk: RiskLevel,
) -> List[str]:
"""
    Build a list of plain-language reasons (up to 7 bullets) explaining the verdict.
Each reason is written for an average Indian news reader.
"""
reasons: List[str] = []
prefix = {
RiskLevel.TRUSTWORTHY: "βœ…",
RiskLevel.BE_CAREFUL: "⚠️",
RiskLevel.MISLEADING: "πŸ”Ά",
RiskLevel.FAKE_NEWS: "🚨",
}[risk]
# ── Layer 1 reasons ───────────────────────────────────────────────────────
if l1.stance_alignment > 0.6:
reasons.append(f"βœ… The headline accurately describes what the article actually says")
elif l1.stance_alignment > 0.2:
reasons.append(f"⚠️ The headline partially matches the article β€” some details may be exaggerated")
else:
reasons.append(f"🚨 The headline does NOT match the article body β€” this is a misleading headline")
if l1.emotional_bias > settings.EMOTIONAL_BIAS_THRESHOLD:
reasons.append(f"{prefix} Uses extreme fear-based or anger-provoking language to manipulate readers")
elif l1.emotional_bias < 0.25:
reasons.append("βœ… Uses calm, factual language β€” not trying to scare or anger you")
if l1.clickbait_score > settings.CLICKBAIT_THRESHOLD:
reasons.append(f"{prefix} Headline uses clickbait tactics designed to get you to click without thinking")
# ── Layer 2 reasons ───────────────────────────────────────────────────────
if l2.image_found:
if l2.reverse_search_match:
reasons.append(f"🚨 The image is {l2.image_age or 'years'} old and being falsely used for this new story")
elif l2.deepfake_score < 0.2:
reasons.append("βœ… Images appear to be original and unedited")
if not l2.metadata_integrity:
reasons.append(f"{prefix} Image metadata was deleted β€” a common trick to hide where the image really came from")
# ── Layer 3 reasons ───────────────────────────────────────────────────────
if l3.domain_age_months < 3:
reasons.append(f"🚨 This website was created only {l3.domain_age} ago β€” brand new sites spreading news are suspicious")
elif l3.domain_age_months > 36:
reasons.append(f"βœ… This website has been established for {l3.domain_age} β€” a sign of credibility")
if l3.is_imposter:
reasons.append(f"🚨 This website is pretending to be '{l3.mimicked_domain}' β€” a well-known news site. This is a common fake news trick")
if not l3.ssl_valid:
reasons.append(f"{prefix} This website has no security certificate β€” real news organisations always have HTTPS")
false_checks = [fc for fc in l3.fact_check_results if "FALSE" in fc.rating.upper()]
if false_checks:
publishers = ", ".join(fc.publisher for fc in false_checks[:2])
reasons.append(f"🚨 Fact-checkers ({publishers}) have already confirmed this story is FALSE")
elif not l3.fact_check_results:
if l3.credibility_score > 80:
reasons.append(f"βœ… No fact-checker has disputed this story from {l3.credibility_score}/100 rated source")
return reasons[:7] # Max 7 reasons
def _build_explanation(
risk: RiskLevel,
l1: Layer1Result,
l2: Layer2Result,
l3: Layer3Result,
) -> str:
"""Build a 2–3 sentence plain-English explanation of the verdict."""
if risk == RiskLevel.TRUSTWORTHY:
parts = [
f"This article is from a well-established source with a credibility score of {l3.credibility_score}/100."
]
if l1.stance_alignment > 0.6:
parts.append("The headline accurately matches what the article body actually says.")
if not l2.reverse_search_match and l2.image_found:
parts.append("The images appear to be original and have not been found in older, unrelated stories.")
if not l3.fact_check_results:
parts.append("No fact-checker has disputed any claim in this story.")
return " ".join(parts)
elif risk == RiskLevel.BE_CAREFUL:
parts = [
"This article makes claims that could not be fully verified."
]
if l3.domain_age_months < 12:
parts.append(f"The website is only {l3.domain_age} old and is not a well-known source.")
if l1.emotional_bias > 0.35:
parts.append("The article uses language that is designed to provoke emotion rather than inform you.")
if l3.fact_check_results:
parts.append("Some claims in this article have been marked as unverified by fact-checkers.")
parts.append("We recommend checking this story on a trusted news outlet before believing or sharing it.")
return " ".join(parts)
elif risk == RiskLevel.MISLEADING:
parts = []
if l1.stance_alignment < 0.3:
parts.append("The headline is misleading β€” it does not accurately represent what the article actually says.")
if l1.emotional_bias > 0.6:
parts.append("The article uses extreme emotional language designed to make you react without thinking.")
if l3.domain_age_months < 12:
parts.append(f"The source website is only {l3.domain_age} old and has a low credibility score of {l3.credibility_score}/100.")
if not parts:
parts.append("Multiple warning signs were detected across this article β€” it is likely misleading.")
return " ".join(parts)
else: # FAKE_NEWS
parts = ["This article shows strong signs of being deliberate fake news."]
if l3.is_imposter:
parts.append(f"The website '{l3.mimicked_domain}' is a fake site impersonating a real news outlet.")
if l2.reverse_search_match:
parts.append(f"The image used is {l2.image_age or 'years'} old and was stolen from an unrelated story.")
false_checks = [fc for fc in l3.fact_check_results if "FALSE" in fc.rating.upper()]
if false_checks:
parts.append(f"The main claims have been confirmed FALSE by {false_checks[0].publisher}.")
if l1.emotional_bias > 0.8:
parts.append("It uses extreme fear-mongering language β€” a hallmark of disinformation.")
return " ".join(parts)
def _build_transparency_note(
l1: Layer1Result,
l2: Layer2Result,
l3: Layer3Result,
) -> str:
"""
A brief note explaining HOW SatyaCheck arrived at its conclusion.
This is the 'glass box' XAI output β€” letting the user understand the process.
"""
checks_done = []
checks_done.append("read and understood the full article text")
if l2.image_found:
checks_done.append("analysed the article's images for tampering and recycled content")
checks_done.append(f"checked the website '{l3.domain_age}' domain for age, security, and credibility")
if l3.fact_check_results:
checks_done.append(f"found {len(l3.fact_check_results)} relevant fact-check report(s)")
checks_str = ", ".join(checks_done[:-1]) + f", and {checks_done[-1]}"
return (
f"SatyaCheck's AI {checks_str}. "
f"The verdict is based on {len([l1, l2, l3])} independent checks "
f"working together β€” not just one signal."
)
def _build_recommendation(risk: RiskLevel) -> str:
"""Final one-line actionable recommendation for the user."""
return {
RiskLevel.TRUSTWORTHY: "Safe to read and share with others.",
RiskLevel.BE_CAREFUL: "Do not share until you verify this with a trusted news source like NDTV, The Hindu, or BBC India.",
RiskLevel.MISLEADING: "Do not share this. The headline is misleading and likely does not reflect the truth.",
RiskLevel.FAKE_NEWS: "Do NOT share this. This is fake news designed to mislead and divide people.",
}[risk]
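# ═══════════════════════════════════════════════════════════════════════════════
# USAGE SKETCH (illustrative only)
# ═══════════════════════════════════════════════════════════════════════════════
# A minimal sketch of how this layer is invoked once Layers 1-3 have run; the
# exact Layer*Result constructor fields live in core.schemas and are assumed
# here rather than shown.
#
#   import asyncio
#   verdict = asyncio.run(run_layer4(l1_result, l2_result, l3_result))
#   print(verdict.overall_risk, f"{verdict.confidence_score:.0f}%")
#   for feat in verdict.shap_features:
#       print(f"  {feat.feature}: {feat.value:+.3f}  ({feat.description})")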