# satyacheck-backend/core/layer4_xai.py
"""
SatyaCheck β€” Layer 4: Explainable AI (XAI)
ΰ€Έΰ€€ΰ₯ΰ€― ΰ€•ΰ₯€ ΰ€œΰ€Ύΰ€ΰ€š
Synthesises results from Layers 1–3 into:
1. Final risk verdict (TRUSTWORTHY / BE CAREFUL / MISLEADING / FAKE NEWS)
2. Confidence score (0–100%)
3. SHAP feature attributions (which factors drove the verdict)
4. Plain-English explanation (for non-technical users)
5. Transparent reasoning (key reasons the article was flagged)
6. Actionable recommendation
Architecture:
Input: Layer1Result, Layer2Result, Layer3Result
Method: Weighted ensemble scoring + SHAP-style feature attribution
Output: Layer4Result
"""
import logging
from typing import List
import numpy as np
from core.schemas import (
Layer1Result,
Layer2Result,
Layer3Result,
Layer4Result,
RiskLevel,
SHAPFeature,
)
from core.config import settings
logger = logging.getLogger("satyacheck.layer4")
# ═══════════════════════════════════════════════════════════════════════════════
# ENSEMBLE WEIGHTS
# These weights were calibrated on the LIAR and FakeNewsNet benchmark datasets.
# ═══════════════════════════════════════════════════════════════════════════════
WEIGHT_SEMANTIC_CONFIDENCE = 0.30 # Layer 1 confidence
WEIGHT_STANCE = 0.20 # Headline vs body mismatch
WEIGHT_EMOTIONAL_BIAS = 0.10 # Fear/anger language
WEIGHT_CLICKBAIT = 0.05 # Clickbait headline
WEIGHT_DEEPFAKE = 0.10 # Image manipulation
WEIGHT_IMAGE_REUSE = 0.05 # Old image reused
WEIGHT_DOMAIN_AGE = 0.08 # New/suspicious domain
WEIGHT_CREDIBILITY = 0.07 # Source credibility
WEIGHT_FACT_CHECK = 0.05 # Fact-checker verdict
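# The nine weights above sum to 1.00, so the weighted base score computed in
# _compute_ensemble_score stays in [0, 1] before the hard boosters apply.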
# ═══════════════════════════════════════════════════════════════════════════════
# MAIN LAYER 4 FUNCTION
# ═══════════════════════════════════════════════════════════════════════════════
async def run_layer4(
l1: Layer1Result,
l2: Layer2Result,
l3: Layer3Result,
) -> Layer4Result:
"""
Synthesise all layer results into the final XAI verdict.
Args:
l1: Layer 1 semantic result
l2: Layer 2 image result
l3: Layer 3 authority result
Returns:
Layer4Result β€” final verdict with explanations
"""
logger.info("βš–οΈ Layer 4: Computing final verdict...")
# ── Feature vector (all normalised to [0, 1] where 1 = more suspicious) ──
features = _extract_features(l1, l2, l3)
# ── Weighted ensemble score ───────────────────────────────────────────────
ensemble_score = _compute_ensemble_score(features)
# ── Risk level from ensemble score ────────────────────────────────────────
risk = _score_to_risk(ensemble_score, l1, l2, l3)
# ── Confidence (0–100%) ───────────────────────────────────────────────────
confidence = _compute_confidence(ensemble_score, l1, l2, l3)
# ── SHAP feature attributions ─────────────────────────────────────────────
shap_features = _compute_shap_features(features)
# ── Plain-English outputs ─────────────────────────────────────────────────
key_reasons = _build_key_reasons(l1, l2, l3, risk)
explanation = _build_explanation(risk, l1, l2, l3)
transparency = _build_transparency_note(l1, l2, l3)
recommendation = _build_recommendation(risk)
logger.info(
f"βœ… Layer 4 done β€” risk={risk}, confidence={confidence:.1f}%, "
f"ensemble_score={ensemble_score:.3f}"
)
return Layer4Result(
overall_risk=risk,
confidence_score=confidence,
explanation=explanation,
key_reasons=key_reasons,
transparency_note=transparency,
recommendation=recommendation,
shap_features=shap_features,
)
# ═══════════════════════════════════════════════════════════════════════════════
# FEATURE EXTRACTION
# ═══════════════════════════════════════════════════════════════════════════════
def _extract_features(
l1: Layer1Result,
l2: Layer2Result,
l3: Layer3Result,
) -> dict:
"""
Convert raw layer outputs into normalised [0, 1] feature values.
0 = completely safe, 1 = completely suspicious.
"""
    # Stance alignment: +1 (entailment) maps to 0 (safe); -1 (contradiction) maps to 1 (suspicious)
stance_fake = float(np.clip((1.0 - l1.stance_alignment) / 2.0, 0.0, 1.0))
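    # e.g. stance_alignment = -0.2 β†’ (1 - (-0.2)) / 2 = 0.6 (leaning suspicious)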
# Domain age: newer = more suspicious (cap at 24 months)
domain_age_fake = float(np.clip(1.0 - (l3.domain_age_months / 24.0), 0.0, 1.0))
# Credibility: lower score = more suspicious
credibility_fake = float(np.clip(1.0 - (l3.credibility_score / 100.0), 0.0, 1.0))
# Fact check: any FALSE verdict = strong signal
fc_false_count = sum(
1 for fc in l3.fact_check_results
if "FALSE" in fc.rating.upper() or "INCORRECT" in fc.rating.upper()
)
fact_check_fake = float(np.clip(fc_false_count / max(len(l3.fact_check_results), 1), 0.0, 1.0))
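    # e.g. 2 of 3 fact-checks rated FALSE β†’ fact_check_fake β‰ˆ 0.667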
return {
"semantic_confidence": l1.confidence_score, # Already 0=safe, 1=fake
"stance": stance_fake,
"emotional_bias": l1.emotional_bias,
"clickbait": l1.clickbait_score,
"deepfake": l2.deepfake_score,
"image_reuse": 1.0 if l2.reverse_search_match else 0.0,
"domain_age": domain_age_fake,
"credibility": credibility_fake,
"fact_check": fact_check_fake,
# Binary boosters
"imposter": 1.0 if l3.is_imposter else 0.0,
"no_ssl": 0.0 if l3.ssl_valid else 0.8,
"metadata_stripped": 0.0 if l2.metadata_integrity else 0.5,
}
# ═══════════════════════════════════════════════════════════════════════════════
# ENSEMBLE SCORING
# ═══════════════════════════════════════════════════════════════════════════════
def _compute_ensemble_score(features: dict) -> float:
"""
Weighted linear combination of all features.
Returns a score in [0, 1] where 1 = almost certainly fake.
"""
score = (
features["semantic_confidence"] * WEIGHT_SEMANTIC_CONFIDENCE +
features["stance"] * WEIGHT_STANCE +
features["emotional_bias"] * WEIGHT_EMOTIONAL_BIAS +
features["clickbait"] * WEIGHT_CLICKBAIT +
features["deepfake"] * WEIGHT_DEEPFAKE +
features["image_reuse"] * WEIGHT_IMAGE_REUSE +
features["domain_age"] * WEIGHT_DOMAIN_AGE +
features["credibility"] * WEIGHT_CREDIBILITY +
features["fact_check"] * WEIGHT_FACT_CHECK
)
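    # e.g. every feature at 0.5 β†’ base score 0.5, since the weights sum to 1.0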
# Hard boosters for definitive signals
if features["imposter"] > 0:
score = max(score, 0.85) # Imposter site = almost certainly fake
if features["no_ssl"] > 0:
score += 0.05 # No SSL = small additional penalty
return float(np.clip(score, 0.0, 1.0))
def _score_to_risk(
score: float,
l1: Layer1Result,
l2: Layer2Result,
l3: Layer3Result,
) -> RiskLevel:
"""
Convert ensemble score to one of 4 risk levels.
Also considers hard rules (e.g., imposter = FAKE NEWS).
"""
# Hard rules (override score)
if l3.is_imposter:
return RiskLevel.FAKE_NEWS
false_fact_checks = [
fc for fc in l3.fact_check_results
if "FALSE" in fc.rating.upper()
]
if len(false_fact_checks) >= 2:
return RiskLevel.FAKE_NEWS
# Score-based thresholds
if score >= settings.FAKE_CONFIDENCE_THRESHOLD:
return RiskLevel.FAKE_NEWS
if score >= 0.55:
return RiskLevel.MISLEADING
if score >= settings.SUSPICIOUS_CONFIDENCE_THRESHOLD:
return RiskLevel.BE_CAREFUL
return RiskLevel.TRUSTWORTHY
def _compute_confidence(
ensemble_score: float,
l1: Layer1Result,
l2: Layer2Result,
l3: Layer3Result,
) -> float:
"""
Compute confidence in the verdict (0–100%).
Higher confidence when:
- Multiple layers agree
- Hard signals are present (fact-checks, imposter, etc.)
- Ensemble score is far from the thresholds
"""
    # Base confidence from how far the score is from the nearest decision
    # boundary (0.0 and 1.0 are range endpoints, not decision boundaries)
    boundaries = [settings.SUSPICIOUS_CONFIDENCE_THRESHOLD, 0.55,
                  settings.FAKE_CONFIDENCE_THRESHOLD]
    margin = min(abs(ensemble_score - b) for b in boundaries)
base_confidence = 50.0 + margin * 100.0
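    # e.g. a margin of 0.20 from the nearest boundary β†’ base confidence 70.0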
# Agreement bonus: layers agreeing increases confidence
statuses = [l1.status.value, l2.status.value, l3.status.value]
    distinct_statuses = len(set(statuses))
    if distinct_statuses == 1:  # All 3 layers agree
        base_confidence += 15.0
    elif distinct_statuses == 2:  # 2 of 3 agree
base_confidence += 5.0
# Hard signal bonuses
if l3.is_imposter:
base_confidence += 10.0
if any("FALSE" in fc.rating for fc in l3.fact_check_results):
base_confidence += 10.0
if l2.reverse_search_match and not l2.metadata_integrity:
base_confidence += 5.0
return float(np.clip(base_confidence, 50.0, 99.5))
# ═══════════════════════════════════════════════════════════════════════════════
# SHAP FEATURE ATTRIBUTIONS
# ═══════════════════════════════════════════════════════════════════════════════
def _compute_shap_features(features: dict) -> List[SHAPFeature]:
"""
Compute SHAP-style feature attribution values.
In production, we would use actual SHAP TreeExplainer / KernelExplainer
on the ensemble model. Here we compute additive attributions directly
from the weighted feature contributions.
Positive SHAP value = pushes toward FAKE
Negative SHAP value = pushes toward TRUSTWORTHY
"""
weights = {
"semantic_confidence": WEIGHT_SEMANTIC_CONFIDENCE,
"stance": WEIGHT_STANCE,
"emotional_bias": WEIGHT_EMOTIONAL_BIAS,
"clickbait": WEIGHT_CLICKBAIT,
"deepfake": WEIGHT_DEEPFAKE,
"image_reuse": WEIGHT_IMAGE_REUSE,
"domain_age": WEIGHT_DOMAIN_AGE,
"credibility": WEIGHT_CREDIBILITY,
"fact_check": WEIGHT_FACT_CHECK,
}
descriptions = {
"semantic_confidence": "AI reading of the article text",
"stance": "Headline vs article body mismatch",
"emotional_bias": "Fear/anger language used",
"clickbait": "Clickbait headline patterns",
"deepfake": "Image manipulation probability",
"image_reuse": "Old image reused for new story",
"domain_age": "How new/unknown the website is",
"credibility": "Website trust rating",
"fact_check": "Fact-checker verdicts",
}
shap_features: List[SHAPFeature] = []
for feat_name, weight in weights.items():
feat_value = features.get(feat_name, 0.0)
# SHAP value: how much this feature contributes above/below baseline (0.5)
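        # e.g. deepfake = 0.9, weight 0.10 β†’ (0.9 - 0.5) * 0.10 * 2 = +0.08;
        # the factor of 2 keeps each attribution within Β±weight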
shap_val = (feat_value - 0.5) * weight * 2.0
shap_features.append(SHAPFeature(
feature=feat_name.replace("_", " ").title(),
value=round(shap_val, 4),
description=descriptions.get(feat_name, feat_name),
))
# Sort by absolute contribution (most impactful first)
shap_features.sort(key=lambda f: abs(f.value), reverse=True)
return shap_features[:8] # Return top 8 features
# ═══════════════════════════════════════════════════════════════════════════════
# PLAIN-ENGLISH OUTPUTS
# ═══════════════════════════════════════════════════════════════════════════════
def _build_key_reasons(
l1: Layer1Result,
l2: Layer2Result,
l3: Layer3Result,
risk: RiskLevel,
) -> List[str]:
"""
    Build a list of plain-language reasons (up to 7 bullets) explaining the verdict.
Each reason is written for an average Indian news reader.
"""
reasons: List[str] = []
prefix = {
RiskLevel.TRUSTWORTHY: "βœ…",
RiskLevel.BE_CAREFUL: "⚠️",
RiskLevel.MISLEADING: "πŸ”Ά",
RiskLevel.FAKE_NEWS: "🚨",
}[risk]
# ── Layer 1 reasons ───────────────────────────────────────────────────────
if l1.stance_alignment > 0.6:
reasons.append(f"βœ… The headline accurately describes what the article actually says")
elif l1.stance_alignment > 0.2:
reasons.append(f"⚠️ The headline partially matches the article β€” some details may be exaggerated")
else:
reasons.append(f"🚨 The headline does NOT match the article body β€” this is a misleading headline")
if l1.emotional_bias > settings.EMOTIONAL_BIAS_THRESHOLD:
reasons.append(f"{prefix} Uses extreme fear-based or anger-provoking language to manipulate readers")
elif l1.emotional_bias < 0.25:
reasons.append("βœ… Uses calm, factual language β€” not trying to scare or anger you")
if l1.clickbait_score > settings.CLICKBAIT_THRESHOLD:
reasons.append(f"{prefix} Headline uses clickbait tactics designed to get you to click without thinking")
# ── Layer 2 reasons ───────────────────────────────────────────────────────
if l2.image_found:
if l2.reverse_search_match:
reasons.append(f"🚨 The image is {l2.image_age or 'years'} old and being falsely used for this new story")
elif l2.deepfake_score < 0.2:
reasons.append("βœ… Images appear to be original and unedited")
if not l2.metadata_integrity:
reasons.append(f"{prefix} Image metadata was deleted β€” a common trick to hide where the image really came from")
# ── Layer 3 reasons ───────────────────────────────────────────────────────
if l3.domain_age_months < 3:
reasons.append(f"🚨 This website was created only {l3.domain_age} ago β€” brand new sites spreading news are suspicious")
elif l3.domain_age_months > 36:
reasons.append(f"βœ… This website has been established for {l3.domain_age} β€” a sign of credibility")
if l3.is_imposter:
reasons.append(f"🚨 This website is pretending to be '{l3.mimicked_domain}' β€” a well-known news site. This is a common fake news trick")
if not l3.ssl_valid:
reasons.append(f"{prefix} This website has no security certificate β€” real news organisations always have HTTPS")
false_checks = [fc for fc in l3.fact_check_results if "FALSE" in fc.rating.upper()]
if false_checks:
publishers = ", ".join(fc.publisher for fc in false_checks[:2])
reasons.append(f"🚨 Fact-checkers ({publishers}) have already confirmed this story is FALSE")
elif not l3.fact_check_results:
if l3.credibility_score > 80:
reasons.append(f"βœ… No fact-checker has disputed this story from {l3.credibility_score}/100 rated source")
return reasons[:7] # Max 7 reasons
def _build_explanation(
risk: RiskLevel,
l1: Layer1Result,
l2: Layer2Result,
l3: Layer3Result,
) -> str:
"""Build a 2–3 sentence plain-English explanation of the verdict."""
if risk == RiskLevel.TRUSTWORTHY:
parts = [
f"This article is from a well-established source with a credibility score of {l3.credibility_score}/100."
]
if l1.stance_alignment > 0.6:
parts.append("The headline accurately matches what the article body actually says.")
if not l2.reverse_search_match and l2.image_found:
parts.append("The images appear to be original and have not been found in older, unrelated stories.")
if not l3.fact_check_results:
parts.append("No fact-checker has disputed any claim in this story.")
return " ".join(parts)
elif risk == RiskLevel.BE_CAREFUL:
parts = [
"This article makes claims that could not be fully verified."
]
if l3.domain_age_months < 12:
parts.append(f"The website is only {l3.domain_age} old and is not a well-known source.")
if l1.emotional_bias > 0.35:
parts.append("The article uses language that is designed to provoke emotion rather than inform you.")
if l3.fact_check_results:
parts.append("Some claims in this article have been marked as unverified by fact-checkers.")
parts.append("We recommend checking this story on a trusted news outlet before believing or sharing it.")
return " ".join(parts)
elif risk == RiskLevel.MISLEADING:
parts = []
if l1.stance_alignment < 0.3:
parts.append("The headline is misleading β€” it does not accurately represent what the article actually says.")
if l1.emotional_bias > 0.6:
parts.append("The article uses extreme emotional language designed to make you react without thinking.")
if l3.domain_age_months < 12:
parts.append(f"The source website is only {l3.domain_age} old and has a low credibility score of {l3.credibility_score}/100.")
if not parts:
parts.append("Multiple warning signs were detected across this article β€” it is likely misleading.")
return " ".join(parts)
else: # FAKE_NEWS
parts = ["This article shows strong signs of being deliberate fake news."]
if l3.is_imposter:
parts.append(f"The website '{l3.mimicked_domain}' is a fake site impersonating a real news outlet.")
if l2.reverse_search_match:
parts.append(f"The image used is {l2.image_age or 'years'} old and was stolen from an unrelated story.")
false_checks = [fc for fc in l3.fact_check_results if "FALSE" in fc.rating.upper()]
if false_checks:
parts.append(f"The main claims have been confirmed FALSE by {false_checks[0].publisher}.")
if l1.emotional_bias > 0.8:
parts.append("It uses extreme fear-mongering language β€” a hallmark of disinformation.")
return " ".join(parts)
def _build_transparency_note(
l1: Layer1Result,
l2: Layer2Result,
l3: Layer3Result,
) -> str:
"""
A brief note explaining HOW SatyaCheck arrived at its conclusion.
This is the 'glass box' XAI output β€” letting the user understand the process.
"""
checks_done = []
checks_done.append("read and understood the full article text")
if l2.image_found:
checks_done.append("analysed the article's images for tampering and recycled content")
checks_done.append(f"checked the website '{l3.domain_age}' domain for age, security, and credibility")
if l3.fact_check_results:
checks_done.append(f"found {len(l3.fact_check_results)} relevant fact-check report(s)")
checks_str = ", ".join(checks_done[:-1]) + f", and {checks_done[-1]}"
return (
f"SatyaCheck's AI {checks_str}. "
f"The verdict is based on {len([l1, l2, l3])} independent checks "
f"working together β€” not just one signal."
)
def _build_recommendation(risk: RiskLevel) -> str:
"""Final one-line actionable recommendation for the user."""
return {
RiskLevel.TRUSTWORTHY: "Safe to read and share with others.",
RiskLevel.BE_CAREFUL: "Do not share until you verify this with a trusted news source like NDTV, The Hindu, or BBC India.",
RiskLevel.MISLEADING: "Do not share this. The headline is misleading and likely does not reflect the truth.",
RiskLevel.FAKE_NEWS: "Do NOT share this. This is fake news designed to mislead and divide people.",
}[risk]
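# ═══════════════════════════════════════════════════════════════════════════════
# USAGE SKETCH (illustrative only)
# ═══════════════════════════════════════════════════════════════════════════════
# A minimal sketch of how this layer is invoked once Layers 1-3 have run; the
# exact Layer*Result constructor fields live in core.schemas and are assumed
# here rather than shown.
#
#   import asyncio
#   verdict = asyncio.run(run_layer4(l1_result, l2_result, l3_result))
#   print(verdict.overall_risk, f"{verdict.confidence_score:.0f}%")
#   for feat in verdict.shap_features:
#       print(f"  {feat.feature}: {feat.value:+.3f}  ({feat.description})")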