Spaces:
Sleeping
Sleeping
Upload 13 files
Browse files- decision_engine.py +130 -0
- main.py +766 -0
- model.py +141 -0
- nlp_utils.py +236 -0
- requirements.txt +24 -0
- scraper.py +77 -0
- trained_model_v2/config.json +33 -0
- trained_model_v2/model.safetensors +3 -0
- trained_model_v2/special_tokens_map.json +7 -0
- trained_model_v2/tokenizer.json +0 -0
- trained_model_v2/tokenizer_config.json +55 -0
- trained_model_v2/vocab.txt +0 -0
- verifier.py +422 -0
decision_engine.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Decision Engine for VeriLens AI
|
| 3 |
+
Combines ML prediction, verification similarity, source credibility,
|
| 4 |
+
and NLP analysis into a final verdict.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
from dataclasses import dataclass, field
|
| 9 |
+
|
| 10 |
+
@dataclass
class Decision:
    """Final verdict returned by :func:`make_decision`."""

    prediction: str  # "REAL", "FAKE", or "UNCERTAIN"
    confidence: int  # 0 – 100
    explanation: str
    factors: dict = field(default_factory=dict)  # per-signal score breakdown


def _clamp_unit(value: float) -> float:
    """Clamp *value* into the closed interval [0, 1]."""
    return max(0.0, min(1.0, value))


def make_decision(
    ml_label: str,
    ml_confidence: float,
    similarity_score: float,
    sources_verified: bool,
    suspicious_info: dict,
    high_trust_count: int = 0,
    low_trust_count: int = 0,
) -> Decision:
    """Combine the individual signals into one weighted verdict.

    Four sub-scores are summed into a 0-100 "realness" total:
    ML prediction (0-45), source verification (0-25), source credibility
    (0-15) and a suspicious-language score (0-15, higher means cleaner text).
    A total >= 65 yields "REAL", <= 40 yields "FAKE", anything between is
    "UNCERTAIN".

    Args:
        ml_label: Classifier label; compared case-insensitively against
            "FAKE" and "REAL". Anything else is treated as inconclusive.
        ml_confidence: Classifier confidence as a fraction; clamped to [0, 1].
        similarity_score: Verification similarity as a fraction; clamped to
            [0, 1]. Only used when ``sources_verified`` is true.
        sources_verified: Whether online verification was available.
        suspicious_info: Mapping that may carry ``"total_suspicious_count"``;
            ``None`` or a missing key counts as zero.
        high_trust_count: Number of high-trust sources found.
        low_trust_count: Number of low-trust sources found.

    Returns:
        A :class:`Decision` with the verdict, a 0-100 confidence relative to
        that verdict, a human-readable explanation and the rounded sub-scores.
    """
    # Normalise inputs so a differently-cased label or an out-of-range
    # fraction cannot push a sub-score outside its documented band.
    label = (ml_label or "").strip().upper()
    ml_confidence = _clamp_unit(ml_confidence)
    similarity_score = _clamp_unit(similarity_score)
    suspicious_info = suspicious_info or {}

    # ML score contribution (0-45): a confident FAKE pulls the score down.
    if label == "FAKE":
        ml_score = (1 - ml_confidence) * 45
    elif label == "REAL":
        ml_score = ml_confidence * 45
    else:
        ml_score = 22.5

    # Verification score contribution (0-25); neutral midpoint when offline.
    verify_score = similarity_score * 25 if sources_verified else 12.5

    # Source credibility contribution (0-15); neutral midpoint without
    # any trust-rated sources.
    rated_sources = high_trust_count + low_trust_count
    if rated_sources > 0:
        cred_score = _clamp_unit(high_trust_count / rated_sources) * 15
    else:
        cred_score = 7.5

    # Suspicious language score (0-15): fewer flagged phrases scores higher.
    sus_count = suspicious_info.get("total_suspicious_count", 0)
    if sus_count == 0:
        sus_score = 15
    elif sus_count <= 2:
        sus_score = 10
    elif sus_count <= 5:
        sus_score = 5
    else:
        sus_score = 0

    # Aggregate.
    total = ml_score + verify_score + cred_score + sus_score
    total = max(0, min(100, total))

    # Guard: a reasonably confident FAKE prediction must not flip to REAL
    # just because related articles exist; cap it inside the UNCERTAIN band.
    ml_fake_overridden = False
    if label == "FAKE" and ml_confidence >= 0.6 and total >= 65:
        total = 55
        ml_fake_overridden = True

    # Decide verdict (uppercase labels).
    if total >= 65:
        prediction = "REAL"
    elif total <= 40:
        prediction = "FAKE"
    else:
        prediction = "UNCERTAIN"

    # Confidence is expressed relative to the chosen verdict.
    if prediction == "REAL":
        confidence = int(round(total))
    elif prediction == "FAKE":
        confidence = 100 - int(round(total))
    else:
        # 52.5 is the midpoint of the 40-65 UNCERTAIN band.
        distance = abs(total - 52.5)
        confidence = max(30, min(50, int(round(50 - distance))))

    # Build explanation.
    explanations: list[str] = []

    if label == "FAKE":
        explanations.append(f"The AI model classified this as FAKE with {ml_confidence:.0%} confidence.")
        if ml_fake_overridden:
            explanations.append("Although related articles exist online, they may be debunking the claim rather than confirming it.")
    elif label == "REAL":
        explanations.append(f"The AI model classified this as REAL with {ml_confidence:.0%} confidence.")
    else:
        explanations.append("The AI model could not reach a strong conclusion.")

    if sources_verified:
        if similarity_score > 0.6:
            explanations.append("The claim is well-corroborated by multiple online sources.")
        elif similarity_score > 0.3:
            explanations.append("Some related articles were found, but corroboration is partial.")
        else:
            explanations.append("Very few matching sources were found online.")
    else:
        explanations.append("Internet verification was not available; the verdict relies on AI analysis.")

    if sus_count > 3:
        explanations.append("High levels of suspicious, sensationalist, or emotional language detected.")
    elif sus_count > 0:
        explanations.append("Minor suspicious language patterns were noted.")

    explanation = " ".join(explanations)

    factors = {
        "ml_score": round(ml_score, 2),
        "verification_score": round(verify_score, 2),
        "credibility_score": round(cred_score, 2),
        "language_score": round(sus_score, 2),
    }

    return Decision(
        prediction=prediction,
        confidence=confidence,
        explanation=explanation,
        factors=factors,
    )
|
main.py
ADDED
|
@@ -0,0 +1,766 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
VeriLens AI – FastAPI Backend
|
| 3 |
+
Main application entry point.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
import hashlib
|
| 8 |
+
import logging
|
| 9 |
+
import re
|
| 10 |
+
import time
|
| 11 |
+
from contextlib import asynccontextmanager
|
| 12 |
+
from datetime import datetime, timedelta
|
| 13 |
+
import random
|
| 14 |
+
|
| 15 |
+
from typing import Literal, Optional
|
| 16 |
+
|
| 17 |
+
from fastapi import FastAPI, HTTPException
|
| 18 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 19 |
+
from pydantic import BaseModel, Field
|
| 20 |
+
|
| 21 |
+
from model import classify, load_model
|
| 22 |
+
from nlp_utils import build_search_query, detect_language, detect_suspicious_phrases, extract_keywords
|
| 23 |
+
from scraper import extract_article
|
| 24 |
+
from verifier import verify_claim
|
| 25 |
+
from decision_engine import make_decision
|
| 26 |
+
|
| 27 |
+
# ── Logging ─────────────────────────────────────────────────────────────────
|
| 28 |
+
# Configure the root logger once at import time and expose the
# application-wide "verilens" logger used throughout this module.
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)-7s | %(name)s | %(message)s")
logger = logging.getLogger("verilens")
|
| 30 |
+
|
| 31 |
+
URL_PATTERN = re.compile(r"^https?://(?:[a-zA-Z0-9\-._~:/?#\[\]@!$&'()*+,;=%])+")


def _is_url(text: str) -> bool:
    """Return True when *text* is a single http(s) URL.

    Surrounding whitespace is ignored. URL_PATTERN only anchors at the
    start, so on its own it would also accept "https://x.y followed by
    prose"; input that still contains whitespace after stripping is
    therefore rejected as not being one URL.
    """
    candidate = text.strip()
    if any(ch.isspace() for ch in candidate):
        return False
    return bool(URL_PATTERN.match(candidate))
|
| 35 |
+
|
| 36 |
+
# ── Lifespan ────────────────────────────────────────────────────────────────
|
| 37 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: start model loading, then log shutdown.

    ``load_model`` is started on a daemon thread rather than called inline,
    so this hook reaches ``yield`` without waiting for it to finish.
    """
    import threading

    logger.info("Starting VeriLens AI backend …")
    threading.Thread(target=load_model, daemon=True).start()
    yield
    logger.info("Shutting down VeriLens AI backend.")
|
| 44 |
+
|
| 45 |
+
# ── FastAPI app ─────────────────────────────────────────────────────────────
|
| 46 |
+
app = FastAPI(title="VeriLens AI", description="Hybrid Fake News Detection System", version="1.0.0", lifespan=lifespan)


@app.get("/")
def health_check():
    """Liveness probe: return a static status payload."""
    return {"status": "Truth Bureau Backend is Alive and Running"}


# NOTE(review): wildcard origins combined with allow_credentials=True lets
# any site issue credentialed cross-origin requests. FastAPI's CORS docs say
# origins must be listed explicitly when credentials are enabled — confirm
# the frontend origin(s) and restrict this before production use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 59 |
+
|
| 60 |
+
# ── Schemas ──────────────────────────────────────────────────────────────
|
| 61 |
+
# Request body for analysis: a single free-form string that the module
# later tests with _is_url to tell a URL from plain text.
class AnalyzeRequest(BaseModel):
    input: str
|
| 63 |
+
|
| 64 |
+
# One verification source as returned to the client.
class SourceOut(BaseModel):
    title: str
    url: str
    snippet: str
    trust: str  # compared against "high" / "medium" / "low" elsewhere in this module
|
| 69 |
+
|
| 70 |
+
# ── NEW: Origin & Mutation Map schemas ───────────────────────────────────
|
| 71 |
+
class OriginNode(BaseModel):
    """A node on the Origin & Mutation Map (newspaper clipping)."""

    id: str
    node_type: str  # "hostile_actor" | "amplifier" | "current_claim"
    source_type: str  # "FORUM POST", "SOCIAL MEDIA", "MAJOR NEWS OUTLET", etc.
    author: str  # "ANON_USER44", "@HEALTHGURU_99", outlet name
    timestamp: str  # ISO-ish date string
    snippet: str  # The text on the clipping
    url: str  # Link to examine source
|
| 80 |
+
|
| 81 |
+
class MutationConnection(BaseModel):
    """A dotted line between two nodes with an NLI badge."""

    from_node: str  # id of source node
    to_node: str  # id of target node
    nli_label: str  # "ENTAILMENT" | "CONTRADICTION"
    nli_score: int  # percentage, e.g. 98
|
| 87 |
+
|
| 88 |
+
class GroundTruthItem(BaseModel):
    """One item in the evidence analysis list."""

    index: int
    text: str
    badge: str  # "UNVERIFIED" | "CONTRADICTION" | "FALLACY" | "CORROBORATED"
|
| 93 |
+
|
| 94 |
+
class GroundTruthData(BaseModel):
    """The Established Fact + Evidence Analysis panel."""

    established_fact: str  # The corrective summary
    evidence_items: list[GroundTruthItem]
|
| 98 |
+
|
| 99 |
+
# Container for the Origin & Mutation Map: its nodes plus the NLI-labelled
# connections between them.
class OriginMapData(BaseModel):
    nodes: list[OriginNode]
    connections: list[MutationConnection]
|
| 102 |
+
|
| 103 |
+
# ── NEW: Frontend-compatible schemas (matches React sampleAnalysis) ──────
|
| 104 |
+
# Annotation attached to a text segment: a category plus an explanatory note.
class FrontendAnnotation(BaseModel):
    type: Literal['contradiction', 'fallacy', 'unverified', 'verified']
    note: str
|
| 107 |
+
|
| 108 |
+
# One slice of the claim text, optionally flagged and annotated.
class FrontendSegment(BaseModel):
    text: str
    isSuspicious: bool  # camelCase kept to match the React frontend payload
    annotation: Optional[FrontendAnnotation] = None
|
| 112 |
+
|
| 113 |
+
# A card on the frontend Evidence Board, including its layout position.
class FrontendEvidenceNode(BaseModel):
    id: str
    role: Literal['hostile', 'amplifier', 'current']
    type: str
    date: str
    author: str
    content: str
    x: float  # layout position; units are defined by the frontend — TODO confirm
    y: float
    rotation: float
    url: Optional[str] = None
|
| 124 |
+
|
| 125 |
+
# Edge between two evidence nodes. "from" is a Python keyword, so the field
# is named from_field and aliased to "from" for input and serialization;
# populate_by_name also allows constructing it as from_field=... in code.
class FrontendConnection(BaseModel):
    from_field: str = Field(alias="from", serialization_alias="from")
    to: str
    nli: dict  # {"type": "contradiction" | "entailment", "score": int}

    model_config = {"populate_by_name": True}
|
| 131 |
+
|
| 132 |
+
# Full analysis response: core verdict fields, dashboard fields, and the
# camelCase fields consumed by the React components.
class AnalyzeResponse(BaseModel):
    input_type: str
    prediction: str
    confidence: int
    explanation: str
    sources: list[SourceOut]
    language: str
    keywords: list[str]
    suspicious: dict
    factors: dict
    elapsed_ms: int
    # ── Figma dashboard fields ───────────────────────────────────────────
    verdict_label: str  # "FABRICATED" | "VERIFIED" | "UNDER REVIEW"
    case_number: str  # e.g. "TB-006753"
    origin_map: OriginMapData  # structured node + connection data
    ground_truth: GroundTruthData  # established fact + evidence items
    # ── Frontend-compatible fields (React components) ────────────────────
    claim: str
    verdict: Literal['VERIFIED', 'FABRICATED', 'INCONCLUSIVE']
    segments: list[FrontendSegment]
    sourceTree: list[FrontendEvidenceNode]
    connections: list[FrontendConnection]
    groundTruth: str  # Dynamic established fact string for the UI
    confidenceExplanation: str  # Detailed analytical breakdown of the confidence score
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
# ── Helpers: build supplementary data from existing signals ──────────────
|
| 159 |
+
# Prediction label -> display verdict. The decision engine emits uppercase
# labels ("FAKE" / "REAL" / "UNCERTAIN"), so both spellings are accepted;
# the original title-case keys are kept for existing callers.
_VERDICT_MAP = {
    "Fake": "FABRICATED",
    "Real": "VERIFIED",
    "Uncertain": "UNDER REVIEW",
    "FAKE": "FABRICATED",
    "REAL": "VERIFIED",
    "UNCERTAIN": "UNDER REVIEW",
}
_FRONTEND_VERDICT_MAP = {
    "Fake": "FABRICATED",
    "Real": "VERIFIED",
    "Uncertain": "INCONCLUSIVE",
    "FAKE": "FABRICATED",
    "REAL": "VERIFIED",
    "UNCERTAIN": "INCONCLUSIVE",
}

# Placeholder author handles for map nodes; the first three are used for
# hostile-actor nodes and the rest for amplifier nodes.
_NODE_AUTHORS = ["ANON_USER44", "@HEALTHGURU_99", "@NEWS_WATCHER", "@VIRAL_POST",
                 "UNKNOWN_SOURCE", "@FACTCHECK_BOT", "@INFO_SPREADER"]

_NODE_TYPES_HOSTILE = ["FORUM POST", "ANONYMOUS TIP", "CHAN BOARD", "DARK WEB POST"]
_NODE_TYPES_AMP = ["SOCIAL MEDIA", "BLOG", "REPOST", "VIRAL TWEET"]
|
| 167 |
+
|
| 168 |
+
def _generate_case_number(text: str) -> str:
    """Return a deterministic case number of the form ``TB-NNNNNN``.

    The number is derived from the first six hex digits of the MD5 digest
    of *text* (used as a cheap fingerprint, not for security), reduced
    modulo 999999 and zero-padded to six digits.
    """
    digest = hashlib.md5(text.encode()).hexdigest()
    num = int(digest[:6], 16) % 999999
    return f"TB-{num:06d}"
|
| 173 |
+
|
| 174 |
+
def _build_origin_map(sources: list, verification_score: float, text: str) -> OriginMapData:
    """Build the Origin & Mutation Map from existing source data.

    Up to four sources become nodes, classified by their ``trust`` value:
    "low" -> hostile actor, "medium" -> amplifier, anything else -> current
    claim. Consecutive nodes are linked with a connection whose NLI label
    and score are derived from ``verification_score`` plus jitter. Source
    types, authors, timestamps and jitter are pseudo-random placeholders,
    seeded by *text* so the same claim always yields the same map.

    Args:
        sources: Objects exposing ``trust``, ``title``, ``snippet``, ``url``.
        verification_score: Overall similarity as a fraction; a falsy value
            falls back to a base score of 50.
        text: The submitted claim text.

    Returns:
        The populated map; with no sources it holds only the claim node.
    """
    nodes: list[OriginNode] = []
    connections: list[MutationConnection] = []

    now = datetime.now()
    # Seed with the string itself: hash(str) is salted per interpreter
    # process, so seeding with it is not reproducible across restarts.
    rng = random.Random(text)

    if not sources:
        # Even with no sources, show the current claim node.
        nodes.append(OriginNode(
            id="claim_0",
            node_type="current_claim",
            source_type="SUBMITTED CLAIM",
            author="USER SUBMISSION",
            timestamp=now.strftime("%Y-%m-%d %H:%M"),
            snippet=text[:120] + ("…" if len(text) > 120 else ""),
            url="",
        ))
        return OriginMapData(nodes=nodes, connections=connections)

    shown = sources[:4]  # max 4 nodes on the map

    # Categorize sources into node types based on trust level.
    for i, src in enumerate(shown):
        if src.trust == "low":
            ntype = "hostile_actor"
            stype = rng.choice(_NODE_TYPES_HOSTILE)
            author = rng.choice(_NODE_AUTHORS[:3])
        elif src.trust == "medium":
            ntype = "amplifier"
            stype = rng.choice(_NODE_TYPES_AMP)
            author = rng.choice(_NODE_AUTHORS[3:])
        else:
            ntype = "current_claim"
            stype = "MAJOR NEWS OUTLET"
            # Use the outlet name after the last " - " in the title, if any.
            author = src.title.split(" - ")[-1] if " - " in src.title else src.title[:30]

        days_ago = rng.randint(1, 14)
        hours = rng.randint(0, 23)
        minutes = rng.randint(0, 59)
        ts = (now - timedelta(days=days_ago)).replace(hour=hours, minute=minutes)

        nodes.append(OriginNode(
            id=f"node_{i}",
            node_type=ntype,
            source_type=stype,
            author=author,
            timestamp=ts.strftime("%Y-%m-%d %H:%M"),
            snippet=src.snippet[:150] if src.snippet else src.title,
            url=src.url,
        ))

    # Connect each node to the next one with an NLI label and score.
    score_base = int(verification_score * 100) if verification_score else 50
    for src, head, tail in zip(shown, nodes, nodes[1:]):
        jitter = rng.randint(-15, 15)
        nli_score = max(10, min(99, score_base + jitter))

        if src.trust == "low":
            nli_label = "CONTRADICTION"
            nli_score = max(70, nli_score)  # hostile actors get high contradiction
        elif nli_score >= 60:
            nli_label = "ENTAILMENT"
        else:
            nli_label = "CONTRADICTION"

        connections.append(MutationConnection(
            from_node=head.id,
            to_node=tail.id,
            nli_label=nli_label,
            nli_score=nli_score,
        ))

    return OriginMapData(nodes=nodes, connections=connections)
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
def _build_ground_truth(
    prediction: str,
    explanation: str,
    suspicious: dict,
    keywords: list[str],
    sources: list,
) -> GroundTruthData:
    """Build the Established Fact + Evidence Analysis from existing signals.

    Args:
        prediction: Verdict label; matched case-insensitively, so both
            "Fake" and the decision engine's uppercase "FAKE" are accepted.
        explanation: Explanation text appended to the established fact.
        suspicious: Mapping that may hold ``clickbait_phrases``,
            ``emotional_language`` and ``unsupported_claims`` lists.
        keywords: Accepted for interface compatibility; not used here.
        sources: Objects exposing ``trust`` and ``title``.

    Returns:
        The established-fact sentence plus a 1-indexed list of evidence
        items; a single placeholder item is emitted when nothing applies.
    """
    verdict = (prediction or "").upper()

    # The established fact is derived from the AI explanation.
    if verdict == "FAKE":
        established_fact = (
            f"Based on cross-referencing {len(sources)} sources and NLI entailment analysis, "
            f"this claim could not be substantiated. {explanation}"
        )
    elif verdict == "REAL":
        established_fact = (
            f"This claim has been corroborated by {len(sources)} independent sources. {explanation}"
        )
    else:
        established_fact = (
            f"Verification produced mixed results across {len(sources)} sources. {explanation}"
        )

    items: list[GroundTruthItem] = []

    def add_item(text: str, badge: str) -> None:
        # Indices are 1-based and follow insertion order.
        items.append(GroundTruthItem(index=len(items) + 1, text=text, badge=badge))

    # Evidence from suspicious phrases: at most two per category.
    for phrase in suspicious.get("clickbait_phrases", [])[:2]:
        add_item(f'Clickbait language detected: "{phrase}"', "FALLACY")

    for phrase in suspicious.get("emotional_language", [])[:2]:
        add_item(f'Emotional manipulation: "{phrase}"', "FALLACY")

    for phrase in suspicious.get("unsupported_claims", [])[:2]:
        add_item(f'Unsupported attribution: "{phrase}"', "UNVERIFIED")

    # Evidence from the sources themselves.
    high_trust_sources = [s for s in sources if s.trust == "high"]
    low_trust_sources = [s for s in sources if s.trust == "low"]

    if high_trust_sources:
        add_item(
            f"Corroborated by {len(high_trust_sources)} high-trust source(s): {high_trust_sources[0].title[:60]}",
            "CORROBORATED",
        )

    if low_trust_sources:
        add_item(
            f"Found in {len(low_trust_sources)} low-trust source(s) — possible disinformation origin",
            "CONTRADICTION",
        )

    if not items:
        add_item("No specific evidence markers detected in the text", "UNVERIFIED")

    return GroundTruthData(established_fact=established_fact, evidence_items=items)
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
# ── Helpers: build frontend-compatible structures ────────────────────────
|
| 332 |
+
|
| 333 |
+
# Layout presets for source nodes: (x, y, rotation) — diverse spread
|
| 334 |
+
# Layout presets for source nodes, each (x, y, rotation).
_SOURCE_LAYOUT_WIKI = (80.0, 20.0, -1)  # Top-right for Wikipedia
# News nodes cycle through these slots in order.
_SOURCE_LAYOUT_NEWS = [
    (20.0, 30.0, -2),
    (50.0, 80.0, 3),
    (15.0, 60.0, 1),
    (60.0, 45.0, -3),
]
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
def _build_direct_source_tree(
    text: str,
    sources: list,
    verification_score: float,
    per_source_scores: list[float] | None = None,
) -> tuple[list[FrontendEvidenceNode], list[FrontendConnection]]:
    """Build the Evidence Board directly from verification sources.

    The first node is always the submitted claim. Up to four source nodes
    follow: at most one Wikipedia entry first, then up to three non-Wikipedia
    sources, topped up with further Wikipedia entries while fewer than three
    sources have been selected. Each source node is connected to the claim
    node with an NLI type and score.

    Args:
        text: The submitted claim text.
        sources: Objects exposing ``url``, ``title``, ``snippet``, ``trust``.
        verification_score: Overall similarity as a fraction, used when no
            per-source score is available for a source.
        per_source_scores: Optional scores aligned with *sources*; ignored
            unless its length matches ``len(sources)``.

    Returns:
        ``(sourceTree, connections)``; with no sources the tree contains
        only the claim node and there are no connections.
    """
    now = datetime.now()
    # Seed with the string itself: hash(str) is salted per interpreter
    # process, so seeding with it is not reproducible across restarts.
    rng = random.Random(text)
    conns: list[FrontendConnection] = []

    # Node 1: the claim (always present).
    nodes: list[FrontendEvidenceNode] = [
        FrontendEvidenceNode(
            id="claim_0",
            role="current",
            type="User Submission",
            date=now.strftime("%Y-%m-%d %H:%M"),
            author="SUBMITTED CLAIM",
            content=text[:150] + ("…" if len(text) > 150 else ""),
            x=50.0,
            y=75.0,
            rotation=2,
        )
    ]

    if not sources:
        return nodes, conns

    # Separate Wikipedia (historical) from news sources.
    wiki_sources = [s for s in sources if "wikipedia.org" in s.url]
    news_sources = [s for s in sources if "wikipedia.org" not in s.url]

    # Each entry: (source, layout_x, layout_y, layout_rot, source_type_label).
    ordered: list[tuple] = []

    # One Wikipedia entry first, if available.
    for ws in wiki_sources[:1]:
        x, y, rot = _SOURCE_LAYOUT_WIKI
        ordered.append((ws, x, y, rot, "Historical Archive"))

    # Then up to three news articles, cycling through the news layout slots.
    news_idx = 0
    for ns in news_sources[:3]:
        x, y, rot = _SOURCE_LAYOUT_NEWS[news_idx % len(_SOURCE_LAYOUT_NEWS)]
        ordered.append((ns, x, y, rot, "News Article"))
        news_idx += 1

    # Still fewer than 3 sources: fill the gap with remaining Wikipedia entries.
    if len(ordered) < 3:
        missing = 3 - len(ordered)
        for ws in wiki_sources[1:1 + missing]:
            x, y, rot = _SOURCE_LAYOUT_NEWS[news_idx % len(_SOURCE_LAYOUT_NEWS)]
            ordered.append((ws, x, y, rot, "Historical Archive"))
            news_idx += 1

    # Per-source score lookup keyed by URL (only when the lists align).
    source_score_map: dict[str, float] = {}
    if per_source_scores and len(per_source_scores) == len(sources):
        for s, sc in zip(sources, per_source_scores):
            source_score_map[s.url] = sc

    for i, (src, x, y, rot, type_label) in enumerate(ordered[:4]):
        # Role follows trust level: only low-trust sources are "hostile".
        role = "hostile" if src.trust == "low" else "amplifier"

        # Extract a readable author name.
        if " - " in src.title:
            author = src.title.split(" - ")[-1].strip()[:30]
        elif "wikipedia.org" in src.url:
            author = "WIKIPEDIA"
        else:
            author = src.title[:30] if src.title else "Unknown Source"

        # Placeholder date: a seeded pseudo-random 1-14 days before now.
        days_ago = rng.randint(1, 14)
        ts = (now - timedelta(days=days_ago)).strftime("%Y-%m-%d %H:%M")
        node_id = f"source_{i + 1}"

        nodes.append(FrontendEvidenceNode(
            id=node_id,
            role=role,
            type=type_label,
            date=ts,
            author=author,
            content=src.snippet[:150] if src.snippet else src.title,
            x=x,
            y=y,
            rotation=rot,
            url=src.url if src.url else None,
        ))

        # Connection: source -> claim, using the per-source score if known.
        src_score = source_score_map.get(src.url, verification_score)
        nli_type = "entailment" if src_score >= 0.65 else "contradiction"
        nli_score = max(10, min(99, int(src_score * 100)))

        conns.append(FrontendConnection(
            from_field=node_id,
            to="claim_0",
            nli={"type": nli_type, "score": nli_score},
        ))

    return nodes, conns
|
| 454 |
+
|
| 455 |
+
|
| 456 |
+
def _extract_ground_truth_string(sources: list) -> str:
    """Pick the established-fact string from the most suitable source.

    Preference order: the first Wikipedia source (its snippet, else its
    title), then the first high-trust source with a snippet, then the first
    source with any snippet. Snippets are truncated to 300 characters.
    A Wikipedia source with neither snippet nor title is skipped rather
    than producing an empty string. Returns a fixed fallback sentence when
    nothing usable is found.
    """
    fallback = "No established fact could be determined from available sources."
    if not sources:
        return fallback

    # Prefer Wikipedia first.
    for s in sources:
        if "wikipedia.org" in s.url:
            fact = s.snippet[:300] if s.snippet else s.title
            if fact:
                return fact

    # Then any high-trust source.
    for s in sources:
        if s.trust == "high" and s.snippet:
            return s.snippet[:300]

    # Fallback to the first source with a snippet.
    for s in sources:
        if s.snippet:
            return s.snippet[:300]

    return fallback
|
| 477 |
+
|
| 478 |
+
|
| 479 |
+
def _build_segments(
    text: str,
    suspicious: dict,
    ground_truth: GroundTruthData,
    ml_label: str = "",
    ml_confidence: float = 0.0,
) -> list[FrontendSegment]:
    """
    Split the claim text into annotated segments.

    When ``ml_label`` is given, a leading "[LINGUISTIC ANALYSIS]" segment is
    emitted carrying a note about the ML model's verdict. The claim is then
    split into sentences; a sentence is flagged when it contains one of the
    detected suspicious phrases, or (for claims of at most five sentences)
    while unused evidence items remain. Each flagged sentence consumes the
    next evidence item from ``ground_truth.evidence_items`` as its annotation.

    Args:
        text: The claim text to segment.
        suspicious: Mapping with optional ``clickbait_phrases``,
            ``emotional_language`` and ``unsupported_claims`` phrase lists.
        ground_truth: Object whose ``evidence_items`` expose ``badge``/``text``.
        ml_label: ML verdict label; empty string skips the leading segment.
        ml_confidence: ML confidence as a 0–1 fraction.

    Returns:
        The ordered list of frontend segments.
    """
    import re as _re

    segments: list[FrontendSegment] = []

    # ── Segment 0: ML Model Linguistic Analysis ──────────────────────────
    if ml_label:
        ml_label_display = ml_label.upper()
        ml_pct = int(ml_confidence * 100)
        if ml_label_display == "FAKE":
            ml_note = (
                f"The local NLP model analyzed the linguistic syntax and scored "
                f"this claim at {ml_pct}% FAKE due to sensationalist phrasing, "
                f"emotional manipulation, or patterns consistent with disinformation."
            )
        elif ml_label_display == "REAL":
            ml_note = (
                f"The local NLP model analyzed the linguistic syntax and scored "
                f"this claim at {ml_pct}% REAL — professional journalistic tone "
                f"detected with minimal sensationalist markers."
            )
        else:
            ml_note = (
                f"The local NLP model analyzed the linguistic syntax but could "
                f"not reach a definitive conclusion (confidence: {ml_pct}%). "
                f"The text contains a mix of professional and informal language patterns."
            )
        segments.append(FrontendSegment(
            text="[LINGUISTIC ANALYSIS] ",
            isSuspicious=True,
            annotation=FrontendAnnotation(type="unverified", note=ml_note),
        ))

    # ── Collect evidence items as potential annotations ───────────────────
    evidence_annotations: list[tuple[str, str]] = [
        (item.badge, item.text) for item in ground_truth.evidence_items
    ]

    # Lower-case the phrases once instead of once per sentence.
    sus_phrases: list[str] = [
        phrase.lower()
        for key in ("clickbait_phrases", "emotional_language", "unsupported_claims")
        for phrase in suspicious.get(key, [])
    ]

    # re.split never returns an empty list (it yields [''] for empty input),
    # so blank pieces are filtered out to make the "no sentences" guard below
    # actually reachable.
    sentences = [
        piece.strip()
        for piece in _re.split(r'(?<=[.!?])\s+', text.strip())
        if piece.strip()
    ]
    if not sentences:
        segments.append(FrontendSegment(text=text, isSuspicious=False))
        return segments

    badge_to_annotation_type = {
        "FALLACY": "fallacy",
        "UNVERIFIED": "unverified",
        "CONTRADICTION": "contradiction",
        "CORROBORATED": "verified",
    }

    # Short claims get evidence attached even without a suspicious phrase.
    is_short_claim = len(sentences) <= 5
    evidence_idx = 0

    for sentence in sentences:
        # Trailing space keeps consecutive segments readable when concatenated.
        sentence_text = sentence + " "
        lowered = sentence_text.lower()

        is_sus = any(phrase in lowered for phrase in sus_phrases)

        has_evidence_left = evidence_idx < len(evidence_annotations)
        if not is_sus and has_evidence_left and is_short_claim:
            is_sus = True

        annotation = None
        if is_sus and has_evidence_left:
            badge, note = evidence_annotations[evidence_idx]
            ann_type = badge_to_annotation_type.get(badge, "unverified")
            annotation = FrontendAnnotation(type=ann_type, note=note)
            evidence_idx += 1

        segments.append(FrontendSegment(
            text=sentence_text,
            # A sentence is only shown as suspicious when it has an annotation.
            isSuspicious=is_sus and annotation is not None,
            annotation=annotation,
        ))

    return segments
|
| 571 |
+
|
| 572 |
+
|
| 573 |
+
def _build_confidence_explanation(
|
| 574 |
+
ml_label: str,
|
| 575 |
+
ml_confidence: float,
|
| 576 |
+
similarity_score: float,
|
| 577 |
+
num_sources: int,
|
| 578 |
+
high_trust_count: int,
|
| 579 |
+
low_trust_count: int,
|
| 580 |
+
final_prediction: str,
|
| 581 |
+
final_confidence: int,
|
| 582 |
+
wiki_verified: bool,
|
| 583 |
+
) -> str:
|
| 584 |
+
"""Build a highly detailed, analytical explanation of how the confidence score was derived."""
|
| 585 |
+
parts: list[str] = []
|
| 586 |
+
|
| 587 |
+
# ── 1. ML Model analysis ─────────────────────────────────────────────
|
| 588 |
+
ml_pct = int(ml_confidence * 100)
|
| 589 |
+
parts.append(
|
| 590 |
+
f"STEP 1 — LINGUISTIC ANALYSIS: The local DistilBERT NLP model "
|
| 591 |
+
f"classified the text as {ml_label.upper()} with {ml_pct}% internal "
|
| 592 |
+
f"confidence after analyzing syntax patterns, sensationalist markers, "
|
| 593 |
+
f"and journalistic tone indicators."
|
| 594 |
+
)
|
| 595 |
+
|
| 596 |
+
# ── 2. Cross-Encoder verification ────────────────────────────────────
|
| 597 |
+
sim_pct = int(similarity_score * 100)
|
| 598 |
+
threshold_met = "PASSED" if similarity_score >= 0.65 else "FAILED"
|
| 599 |
+
parts.append(
|
| 600 |
+
f"STEP 2 — CROSS-ENCODER VERIFICATION: A live internet scan retrieved "
|
| 601 |
+
f"{num_sources} source(s). The Cross-Encoder semantic similarity scored "
|
| 602 |
+
f"{sim_pct}% against the 65% entailment threshold ({threshold_met}). "
|
| 603 |
+
f"{'Wikipedia independently corroborated the claim.' if wiki_verified else 'No Wikipedia corroboration was found.'}"
|
| 604 |
+
)
|
| 605 |
+
|
| 606 |
+
# ── 3. Source trust breakdown ─────────────────────────────────────────
|
| 607 |
+
medium_trust = num_sources - high_trust_count - low_trust_count
|
| 608 |
+
parts.append(
|
| 609 |
+
f"STEP 3 — SOURCE TRUST AUDIT: Of {num_sources} sources, "
|
| 610 |
+
f"{high_trust_count} rated HIGH trust, {medium_trust} rated MEDIUM, "
|
| 611 |
+
f"and {low_trust_count} rated LOW. "
|
| 612 |
+
f"{'A strong evidence base supports this verdict.' if high_trust_count >= 2 else 'The evidence base is limited, which affects overall confidence.'}"
|
| 613 |
+
)
|
| 614 |
+
|
| 615 |
+
# ── 4. Guardrail activations ─────────────────────────────────────────
|
| 616 |
+
guardrails: list[str] = []
|
| 617 |
+
if num_sources == 0:
|
| 618 |
+
guardrails.append("ZERO-EVIDENCE PENALTY (no sources found, verdict forced to FABRICATED)")
|
| 619 |
+
if final_prediction == "Uncertain" and similarity_score < 0.78 and not wiki_verified:
|
| 620 |
+
guardrails.append("MUDDY WATERS GUARDRAIL (weak corroboration, verdict shifted to INCONCLUSIVE)")
|
| 621 |
+
|
| 622 |
+
if guardrails:
|
| 623 |
+
parts.append(f"STEP 4 — GUARDRAILS TRIGGERED: {'; '.join(guardrails)}.")
|
| 624 |
+
else:
|
| 625 |
+
parts.append("STEP 4 — GUARDRAILS: No safety overrides were triggered. The verdict reflects the raw analysis.")
|
| 626 |
+
|
| 627 |
+
# ── 5. Final synthesis ───────────────────────────────────────────────
|
| 628 |
+
parts.append(
|
| 629 |
+
f"FINAL SYNTHESIS: Combining the ML model's {ml_label.upper()} signal, "
|
| 630 |
+
f"the {sim_pct}% semantic match, and {num_sources} source(s), the system "
|
| 631 |
+
f"arrived at a final confidence of {final_confidence}%."
|
| 632 |
+
)
|
| 633 |
+
|
| 634 |
+
return " ▸ ".join(parts)
|
| 635 |
+
|
| 636 |
+
|
| 637 |
+
# ── Endpoints ───────────────────────────────────────────────────────────────
|
| 638 |
+
@app.get("/health")
async def health():
    """Liveness probe: report a static "healthy" status for the service."""
    status_payload = {"status": "healthy", "service": "VeriLens AI"}
    return status_payload
|
| 641 |
+
|
| 642 |
+
@app.post("/analyze", response_model=AnalyzeResponse)
async def analyze(req: AnalyzeRequest):
    """Analyze a claim (raw text or article URL) and return the full verdict.

    Pipeline: resolve the input to text, run NLP feature extraction and the
    ML classifier, verify the claim against live sources, combine everything
    through ``make_decision``, apply the post-decision guardrails, and build
    the dashboard/frontend structures.

    Raises:
        HTTPException: 400 when the input is blank; 422 when article
            extraction from a URL fails with ``ValueError``.
    """
    raw = req.input.strip()
    if not raw:
        raise HTTPException(status_code=400, detail="Input cannot be empty.")

    # Monotonic clock: elapsed time must not be affected by wall-clock jumps.
    t0 = time.perf_counter()

    if _is_url(raw):
        input_type = "URL"
        try:
            # NOTE(review): called synchronously inside an async handler;
            # presumably performs network I/O — consider offloading to a thread.
            article = extract_article(raw)
        except ValueError as exc:
            # Chain the cause so the original extraction error stays visible.
            raise HTTPException(status_code=422, detail=str(exc)) from exc
        text = f"{article.title}. {article.text}"
    else:
        input_type = "TEXT"
        text = raw

    language = detect_language(text)
    keywords = extract_keywords(text, top_n=8)
    suspicious = detect_suspicious_phrases(text)
    search_query = build_search_query(text)

    ml_result = classify(text)
    verification = await verify_claim(text, search_query)

    high_trust = sum(1 for s in verification.sources if s.trust == "high")
    low_trust = sum(1 for s in verification.sources if s.trust == "low")

    # ── Decision ────────────────────────────────────────────────────────────
    decision = make_decision(
        ml_label=ml_result.label,
        ml_confidence=ml_result.confidence,
        similarity_score=verification.similarity_score,
        sources_verified=verification.verified,
        suspicious_info=suspicious,
        high_trust_count=high_trust,
        low_trust_count=low_trust,
    )

    # .title() turns the engine's label into "Real", "Fake", or "Uncertain"
    final_prediction = str(decision.prediction).title()
    final_confidence = int(decision.confidence)
    final_explanation = str(decision.explanation)

    # Check if Wikipedia is one of the returned sources (a missing URL counts
    # as "not Wikipedia" instead of raising).
    wiki_verified = any(
        bool(s.url) and "wikipedia.org" in s.url for s in verification.sources
    )

    # Zero-evidence penalty: a Real/Uncertain verdict with no sources at all
    # is overridden to Fake with a low confidence.
    if final_prediction in ("Real", "Uncertain") and not verification.sources:
        logger.warning("Zero-Evidence Penalty triggered! Overriding AI verdict.")
        final_prediction = "Fake"
        final_confidence = 10  # This forces the UI bar to "Unreliable" (RED)
        final_explanation = "The AI text analysis found no sensationalism, but a live internet scan found ZERO evidence to support this claim. In journalism, a total lack of corroboration for a statement indicates it is unverified or FAKE."

    # "Muddy waters" guardrail: the model says Real, but the similarity to
    # retrieved context is weak/moderate (< 0.78) and Wikipedia is absent.
    elif final_prediction == "Real" and verification.similarity_score < 0.78 and not wiki_verified:
        logger.warning("Muddy Waters Guardrail triggered! Weak internet corroboration.")
        final_prediction = "Uncertain"
        final_confidence = 50  # Pushes UI perfectly to the center YELLOW
        final_explanation = "The AI detected a professional journalistic tone, and related topics were found online. However, the EXACT claim could not be highly corroborated by the Cross-Encoder. This may be a misleading mix of real entities and fake events."

    # ── Build supplementary data for Figma dashboard ────────────────────
    source_outs = [
        SourceOut(title=s.title, url=s.url, snippet=s.snippet, trust=s.trust)
        for s in verification.sources
    ]

    verdict_label = _VERDICT_MAP.get(final_prediction, "UNDER REVIEW")
    case_number = _generate_case_number(text)
    origin_map = _build_origin_map(verification.sources, verification.similarity_score, text)
    ground_truth = _build_ground_truth(
        final_prediction, final_explanation, suspicious, keywords, verification.sources
    )

    # ── Build frontend-compatible structures ─────────────────────────────
    frontend_verdict = _FRONTEND_VERDICT_MAP.get(final_prediction, "INCONCLUSIVE")
    frontend_source_tree, frontend_connections = _build_direct_source_tree(
        text, verification.sources, verification.similarity_score,
    )
    frontend_segments = _build_segments(
        text, suspicious, ground_truth,
        ml_label=ml_result.label, ml_confidence=ml_result.confidence,
    )
    ground_truth_string = _extract_ground_truth_string(verification.sources)

    # ── Build the detailed confidence explanation ─────────────────────────
    confidence_explanation = _build_confidence_explanation(
        ml_label=ml_result.label,
        ml_confidence=ml_result.confidence,
        similarity_score=verification.similarity_score,
        num_sources=len(verification.sources),
        high_trust_count=high_trust,
        low_trust_count=low_trust,
        final_prediction=final_prediction,
        final_confidence=final_confidence,
        wiki_verified=wiki_verified,
    )

    elapsed = int((time.perf_counter() - t0) * 1000)

    return AnalyzeResponse(
        input_type=input_type,
        prediction=final_prediction,
        confidence=final_confidence,
        explanation=final_explanation,
        sources=source_outs,
        language=language,
        keywords=keywords,
        suspicious=suspicious,
        factors=decision.factors,
        elapsed_ms=elapsed,
        verdict_label=verdict_label,
        case_number=case_number,
        origin_map=origin_map,
        ground_truth=ground_truth,
        # ── Frontend fields ──────────────────────────────────────────────
        claim=text,
        verdict=frontend_verdict,
        segments=frontend_segments,
        sourceTree=frontend_source_tree,
        connections=frontend_connections,
        groundTruth=ground_truth_string,
        confidenceExplanation=confidence_explanation,
    )
|
model.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ML Classifier for VeriLens AI
|
| 3 |
+
Primary: HuggingFace text-classification pipeline (DistilBERT).
|
| 4 |
+
Fallback: Heuristic keyword-based scoring when the model is unavailable.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
from dataclasses import dataclass
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
# ── Lazy-loaded globals ─────────────────────────────────────────────────────
|
| 16 |
+
_pipeline = None
|
| 17 |
+
_model_ready = False
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@dataclass
|
| 21 |
+
class ClassificationResult:
|
| 22 |
+
label: str # "FAKE" or "REAL"
|
| 23 |
+
confidence: float # 0.0 – 1.0
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# ── Heuristic fallback ─────────────────────────────────────────────────────
|
| 27 |
+
_FAKE_SIGNALS = [
|
| 28 |
+
"you won't believe", "shocking", "exposed", "secret",
|
| 29 |
+
"they don't want you to know", "mind-blowing", "conspiracy",
|
| 30 |
+
"cover-up", "banned", "censored", "wake up", "big pharma",
|
| 31 |
+
"doctors hate", "one weird trick", "must watch",
|
| 32 |
+
"share before it's too late", "mainstream media won't tell you",
|
| 33 |
+
"spread this before it's deleted", "bombshell", "unbelievable",
|
| 34 |
+
]
|
| 35 |
+
|
| 36 |
+
_REAL_SIGNALS = [
|
| 37 |
+
"according to", "officials said", "the report states",
|
| 38 |
+
"data shows", "peer-reviewed", "study published",
|
| 39 |
+
"reuters", "associated press", "confirmed by",
|
| 40 |
+
"government statement", "press release", "research findings",
|
| 41 |
+
"published in the journal", "the investigation found",
|
| 42 |
+
]
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _heuristic_classify(text: str) -> ClassificationResult:
|
| 46 |
+
"""Simple keyword-based scoring used when the transformer is unavailable."""
|
| 47 |
+
lower = text.lower()
|
| 48 |
+
fake_hits = sum(1 for p in _FAKE_SIGNALS if p in lower)
|
| 49 |
+
real_hits = sum(1 for p in _REAL_SIGNALS if p in lower)
|
| 50 |
+
|
| 51 |
+
total = fake_hits + real_hits
|
| 52 |
+
if total == 0:
|
| 53 |
+
return ClassificationResult(label="UNCERTAIN", confidence=0.50)
|
| 54 |
+
|
| 55 |
+
fake_ratio = fake_hits / total
|
| 56 |
+
if fake_ratio > 0.6:
|
| 57 |
+
return ClassificationResult(label="FAKE", confidence=round(0.5 + fake_ratio * 0.4, 2))
|
| 58 |
+
if fake_ratio < 0.4:
|
| 59 |
+
return ClassificationResult(label="REAL", confidence=round(0.5 + (1 - fake_ratio) * 0.4, 2))
|
| 60 |
+
return ClassificationResult(label="UNCERTAIN", confidence=0.55)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# ── Model loading ──────────────────────────────────────────────────────────
|
| 64 |
+
_LOCAL_MODEL_DIR = Path(__file__).resolve().parent / "trained_model_v2"


def load_model() -> None:
    """
    Initialise the module-level text-classification pipeline.

    The fine-tuned checkpoint in ``trained_model_v2`` (next to this file) is
    used when that directory exists and contains a ``config.json``; otherwise
    the remote HuggingFace checkpoint is requested. The pipeline is placed on
    CUDA, then MPS, then CPU, whichever is available first.

    Any failure is logged as a warning and leaves the module in
    heuristic-fallback mode. Once a load has succeeded, later calls return
    immediately.
    """
    global _pipeline, _model_ready
    if _model_ready:
        return

    try:
        from transformers import pipeline as hf_pipeline
        import torch

        # Hardware detection, in order of preference: CUDA, MPS, CPU.
        if torch.cuda.is_available():
            device = torch.device("cuda")
            gpu_name = torch.cuda.get_device_name(0)
            logger.info(f"Hardware detection: NVIDIA GPU ({gpu_name}) found. Routing to CUDA.")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
            logger.info("Hardware detection: Apple Silicon found. Routing to MPS.")
        else:
            device = torch.device("cpu")
            logger.info("Hardware detection: No GPU found. Defaulting to CPU.")

        has_local_checkpoint = (
            _LOCAL_MODEL_DIR.exists()
            and (_LOCAL_MODEL_DIR / "config.json").exists()
        )
        if has_local_checkpoint:
            checkpoint = str(_LOCAL_MODEL_DIR)
            logger.info("Loading locally trained model from %s …", checkpoint)
        else:
            checkpoint = "hamzab/roberta-fake-news-classification"
            logger.info("Loading HuggingFace remote model: %s …", checkpoint)

        # Build the pipeline on the device selected above.
        _pipeline = hf_pipeline(
            "text-classification",
            model=checkpoint,
            truncation=True,
            max_length=512,
            device=device,
        )
        _model_ready = True
        logger.info("Model loaded successfully.")
    except Exception as exc:
        # Deliberate best-effort: classification falls back to the heuristic.
        logger.warning("Could not load model (%s). Using heuristic fallback.", exc)
        _model_ready = False
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def classify(text: str) -> ClassificationResult:
    """
    Classify *text* as REAL or FAKE (or UNCERTAIN for an unrecognised label).

    Uses the loaded transformer pipeline when it is ready; otherwise, or when
    inference raises, falls back to heuristic scoring.

    Args:
        text: The text to classify; only the first 2048 characters are sent
            to the model.

    Returns:
        The normalised label and the model score rounded to 4 decimals, or
        the heuristic result.
    """
    if not _model_ready or _pipeline is None:
        return _heuristic_classify(text)

    try:
        # Truncate very long texts for speed
        truncated = text[:2048]
        result = _pipeline(truncated)[0]
        raw_label: str = result["label"].upper()
        score: float = result["score"]

        # Normalise labels coming from the model. The substring tests already
        # cover the exact "FAKE"/"REAL" names, so only the generic LABEL_n
        # ids need listing explicitly.
        if "FAKE" in raw_label or raw_label == "LABEL_0":
            label = "FAKE"
        elif "REAL" in raw_label or raw_label in ("LABEL_1", "TRUE"):
            # NOTE(review): "TRUE" is accepted because some fake-news
            # checkpoints appear to name the genuine class that way — confirm
            # against the deployed model's id2label.
            label = "REAL"
        else:
            label = "UNCERTAIN"

        return ClassificationResult(label=label, confidence=round(score, 4))
    except Exception as exc:
        # Deliberate best-effort: never let a model failure break the request.
        logger.error("Model inference failed: %s – falling back to heuristic.", exc)
        return _heuristic_classify(text)
|
nlp_utils.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
NLP Utilities for VeriLens AI
|
| 3 |
+
- Text preprocessing (lowercasing, stopword removal, tokenization)
|
| 4 |
+
- Keyword extraction for search queries
|
| 5 |
+
- Suspicious phrase detection
|
| 6 |
+
- Language detection (English / Hindi)
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import re
|
| 10 |
+
import string
|
| 11 |
+
|
| 12 |
+
# ── stopwords (lightweight, no NLTK download needed) ────────────────────────
|
| 13 |
+
ENGLISH_STOPWORDS = {
|
| 14 |
+
"a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
|
| 15 |
+
"have", "has", "had", "do", "does", "did", "will", "would", "could",
|
| 16 |
+
"should", "may", "might", "shall", "can", "need", "dare", "ought",
|
| 17 |
+
"used", "to", "of", "in", "for", "on", "with", "at", "by", "from",
|
| 18 |
+
"as", "into", "through", "during", "before", "after", "above", "below",
|
| 19 |
+
"between", "out", "off", "over", "under", "again", "further", "then",
|
| 20 |
+
"once", "here", "there", "when", "where", "why", "how", "all", "both",
|
| 21 |
+
"each", "few", "more", "most", "other", "some", "such", "no", "nor",
|
| 22 |
+
"not", "only", "own", "same", "so", "than", "too", "very", "just",
|
| 23 |
+
"because", "but", "and", "or", "if", "while", "about", "up", "its",
|
| 24 |
+
"it", "he", "she", "they", "we", "you", "i", "me", "him", "her",
|
| 25 |
+
"us", "them", "my", "your", "his", "our", "their", "this", "that",
|
| 26 |
+
"these", "those", "what", "which", "who", "whom", "s", "t", "don",
|
| 27 |
+
"didn", "doesn", "hadn", "hasn", "haven", "isn", "wasn", "weren",
|
| 28 |
+
"won", "wouldn", "couldn", "shouldn", "ain", "aren", "re", "ve", "ll",
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
# ── suspicious / clickbait phrases ──────────────────────────────────────────
|
| 32 |
+
CLICKBAIT_PHRASES = [
|
| 33 |
+
"you won't believe",
|
| 34 |
+
"shocking",
|
| 35 |
+
"breaking",
|
| 36 |
+
"exposed",
|
| 37 |
+
"secret",
|
| 38 |
+
"they don't want you to know",
|
| 39 |
+
"what they're hiding",
|
| 40 |
+
"mind-blowing",
|
| 41 |
+
"jaw-dropping",
|
| 42 |
+
"unbelievable",
|
| 43 |
+
"gone wrong",
|
| 44 |
+
"doctors hate",
|
| 45 |
+
"one weird trick",
|
| 46 |
+
"this will change everything",
|
| 47 |
+
"spread this before it's deleted",
|
| 48 |
+
"mainstream media won't tell you",
|
| 49 |
+
"exposed the truth",
|
| 50 |
+
"wake up",
|
| 51 |
+
"big pharma",
|
| 52 |
+
"conspiracy",
|
| 53 |
+
"cover-up",
|
| 54 |
+
"coverup",
|
| 55 |
+
"bombshell",
|
| 56 |
+
"urgent",
|
| 57 |
+
"must watch",
|
| 58 |
+
"must read",
|
| 59 |
+
"share before it's too late",
|
| 60 |
+
"banned",
|
| 61 |
+
"censored",
|
| 62 |
+
]
|
| 63 |
+
|
| 64 |
+
EMOTIONAL_PHRASES = [
|
| 65 |
+
"absolutely",
|
| 66 |
+
"totally",
|
| 67 |
+
"completely",
|
| 68 |
+
"utterly",
|
| 69 |
+
"extremely",
|
| 70 |
+
"terrifying",
|
| 71 |
+
"horrifying",
|
| 72 |
+
"devastating",
|
| 73 |
+
"outrageous",
|
| 74 |
+
"disgusting",
|
| 75 |
+
"insane",
|
| 76 |
+
"crazy",
|
| 77 |
+
"incredible",
|
| 78 |
+
"miraculous",
|
| 79 |
+
"phenomenal",
|
| 80 |
+
"unprecedented",
|
| 81 |
+
"never before seen",
|
| 82 |
+
"the truth about",
|
| 83 |
+
"exposed",
|
| 84 |
+
"the real story",
|
| 85 |
+
]
|
| 86 |
+
|
| 87 |
+
UNSUPPORTED_CLAIM_MARKERS = [
|
| 88 |
+
"sources say",
|
| 89 |
+
"experts believe",
|
| 90 |
+
"studies show",
|
| 91 |
+
"according to sources",
|
| 92 |
+
"rumor has it",
|
| 93 |
+
"allegedly",
|
| 94 |
+
"it is believed",
|
| 95 |
+
"some people say",
|
| 96 |
+
"many believe",
|
| 97 |
+
"reports suggest",
|
| 98 |
+
"anonymous sources",
|
| 99 |
+
"unnamed officials",
|
| 100 |
+
"insiders reveal",
|
| 101 |
+
]
|
| 102 |
+
|
| 103 |
+
# ── Hindi character range for language detection ────────────────────────────
|
| 104 |
+
HINDI_PATTERN = re.compile(r"[\u0900-\u097F]")
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def preprocess_text(text: str) -> str:
    """Normalise *text*: lowercase it, delete ASCII punctuation, drop stopwords.

    Returns the surviving tokens joined by single spaces.
    """
    punctuation_remover = str.maketrans("", "", string.punctuation)
    stripped = text.lower().translate(punctuation_remover)
    return " ".join(
        word for word in stripped.split() if word not in ENGLISH_STOPWORDS
    )
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def extract_keywords(text: str, top_n: int = 10) -> list[str]:
    """Return up to *top_n* of the most frequent non-stopword tokens.

    Tokens of one or two characters are ignored. Ties keep the order in which
    the tokens first appeared in the preprocessed text.
    """
    counts: dict[str, int] = {}
    for token in preprocess_text(text).split():
        if len(token) <= 2:
            continue
        counts[token] = counts.get(token, 0) + 1

    # Stable sort: equally frequent tokens stay in first-seen order.
    ranked = sorted(counts, key=lambda token: counts[token], reverse=True)
    return ranked[:top_n]
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
import re
|
| 130 |
+
|
| 131 |
+
import re
|
| 132 |
+
|
| 133 |
+
# Conversational filler, clickbait, and slang phrases stripped before searching.
_SEARCH_FILLERS = (
    # News/WhatsApp filler
    "is it true that", "i heard that", "someone told me", "can you check if",
    "they are saying", "breaking news", "shocking", "whatsapp forward",
    "forwarded as received", "please verify", "pls verify", "can you verify",
    "fact check this", "tell me if", "did you hear", "rumor has it",
    "watch till the end", "viral video", "secret exposed", "must watch",
    "mind blowing", "i read somewhere", "is this real", "is this fake",
    "check this news", "verify this claim", "you won't believe",
    "alert:", "warning:", "urgent:", "fwd:", "bro is it true", "bhau tell me",

    # Gen Z / Internet Slang Phrases
    "no cap", "fr fr", "on god", "spill the tea", "is it giving",
    "big yikes", "to be honest", "not gonna lie", "out of pocket",
    "let him cook", "make it make sense", "rent free", "touch grass",
    "caught in 4k", "main character energy", "pop off", "periodt",
    "for real", "deadass", "lowkey", "highkey", "tbh", "ngl", "chat is this real",
    "make it viral",
)


def _compile_filler_pattern(fillers) -> "re.Pattern[str]":
    """Build one regex matching any filler as a whole phrase, longest first."""
    alternatives = []
    for filler in sorted(fillers, key=len, reverse=True):
        # Guard only the edges that are word characters, so "alert:" still
        # matches before a space while "ngl" no longer matches inside "england".
        prefix = r"(?<!\w)" if filler[0].isalnum() else ""
        suffix = r"(?!\w)" if filler[-1].isalnum() else ""
        alternatives.append(prefix + re.escape(filler) + suffix)
    return re.compile("|".join(alternatives))


_SEARCH_FILLER_RE = _compile_filler_pattern(_SEARCH_FILLERS)

# Single words dropped from the query: English stop words plus slang.
_SEARCH_STOP_WORDS = frozenset({
    # Standard English NLP Stop Words
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your",
    "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she",
    "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
    "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
    "these", "those", "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an",
    "the", "and", "but", "if", "or", "because", "as", "until", "while", "of",
    "at", "by", "for", "with", "about", "against", "between", "into", "through",
    "during", "before", "after", "above", "below", "to", "from", "up", "down",
    "in", "out", "on", "off", "over", "under", "again", "further", "then",
    "once", "here", "there", "when", "where", "why", "how", "all", "any",
    "both", "each", "few", "more", "most", "other", "some", "such", "no",
    "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s",
    "t", "can", "will", "just", "don", "should", "now", "d", "ll", "m", "o",
    "re", "ve", "y", "ain", "aren", "couldn", "didn", "doesn", "hadn", "hasn",
    "haven", "isn", "ma", "mightn", "mustn", "needn", "shan", "shouldn",
    "wasn", "weren", "won", "wouldn", "tell", "know", "think", "believe",
    "say", "said", "saying", "ask", "asked", "check", "news", "today", "new",

    # Gen Z / Internet Slang Single Words
    "fr", "cap", "bruh", "bro", "dude", "rn", "skibidi", "rizz", "sigma",
    "bet", "af", "smh", "idk", "idc", "lmao", "lmfao", "lol", "rofl", "omg",
    "sus", "legit", "bussin", "yall", "based", "cringe", "ratio", "gyatt",
    "mewing", "lit", "fire", "tea", "dub", "flop", "iykyk", "literally",
    "actually", "basically", "seriously", "like", "yap", "yapping",
    "delulu", "solulu", "pookie", "aura", "chat", "fyi", "lmk", "tldr",
})


def build_search_query(text: str) -> str:
    """
    Reduce *text* to a short keyword query for web search.

    Filler phrases are removed as whole phrases (never from inside another
    word), the remainder is split into word tokens, stop words and slang are
    dropped, and the first 8 remaining words are joined with spaces.

    Args:
        text: The raw claim or message.

    Returns:
        The lower-cased keyword query, or the first 50 characters of *text*
        when no keyword survives the filtering.
    """
    # 1. Strip filler phrases in a single pass.
    clean_text = _SEARCH_FILLER_RE.sub(" ", text.lower())

    # 2. Keep only alphanumeric words
    words = re.findall(r'\b\w+\b', clean_text)

    # 3. Filter out the stop words and slang
    core_keywords = [word for word in words if word not in _SEARCH_STOP_WORDS]

    # 4. Limit to the first 8 keywords to keep the search query focused
    final_query = " ".join(core_keywords[:8])

    # Fallback just in case they typed nothing but slang/stop words
    return final_query if final_query.strip() else text[:50]
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def detect_language(text: str) -> str:
    """Classify *text* as Hindi ("hi") or English ("en").

    Returns "hi" when characters matched by ``HINDI_PATTERN`` exceed 30% of
    the alphabetic character count; otherwise "en", including when the text
    has no alphabetic characters at all.
    """
    alphabetic_total = sum(1 for ch in text if ch.isalpha())
    if alphabetic_total == 0:
        return "en"

    hindi_total = len(HINDI_PATTERN.findall(text))
    return "hi" if hindi_total / alphabetic_total > 0.3 else "en"
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def detect_suspicious_phrases(text: str) -> dict:
    """Report which known suspicious phrases occur in *text*.

    Matching is a case-insensitive substring test against the three
    module-level phrase collections.

    Returns:
        A dict with the matched phrases per category
        (``clickbait_phrases``, ``emotional_language``,
        ``unsupported_claims``) and their combined count
        (``total_suspicious_count``).
    """
    haystack = text.lower()

    def _present(phrases):
        # Keep the phrases in the iteration order of their source collection.
        return [phrase for phrase in phrases if phrase in haystack]

    clickbait = _present(CLICKBAIT_PHRASES)
    emotional = _present(EMOTIONAL_PHRASES)
    unsupported = _present(UNSUPPORTED_CLAIM_MARKERS)

    return {
        "clickbait_phrases": clickbait,
        "emotional_language": emotional,
        "unsupported_claims": unsupported,
        "total_suspicious_count": len(clickbait) + len(emotional) + len(unsupported),
    }
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def tokenize(text: str) -> list[str]:
    """Lower-case *text*, blank out punctuation and split on whitespace.

    Every character that is neither a word character (``\\w``, which
    includes the underscore) nor whitespace is replaced by a space, so
    ``"don't"`` yields ``["don", "t"]``.
    """
    without_punctuation = re.sub(r"[^\w\s]", " ", text.lower())
    return without_punctuation.split()
|
requirements.txt
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ── VeriLens AI V2 Requirements ──────────────────────────────────────────────
|
| 2 |
+
|
| 3 |
+
# Web Server & API
|
| 4 |
+
fastapi==0.115.0
|
| 5 |
+
uvicorn[standard]==0.30.6
|
| 6 |
+
pydantic>=2.0.0
|
| 7 |
+
python-dotenv==1.0.1
|
| 8 |
+
httpx==0.27.2
|
| 9 |
+
|
| 10 |
+
# Modern Web Scraping (Replaces newspaper3k)
|
| 11 |
+
trafilatura>=1.12.0
|
| 12 |
+
lxml-html-clean==0.4.1
|
| 13 |
+
|
| 14 |
+
# Machine Learning & Transformers
|
| 15 |
+
torch==2.4.1
|
| 16 |
+
transformers==4.44.2
|
| 17 |
+
sentence-transformers==3.0.1
|
| 18 |
+
scikit-learn==1.5.1
|
| 19 |
+
numpy>=1.24.0
|
| 20 |
+
pandas>=2.0.0
|
| 21 |
+
|
| 22 |
+
# OS & Internet Tools
|
| 23 |
+
duckduckgo-search>=7.0.0
|
| 24 |
+
certifi
|
scraper.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Web Scraper for VeriLens AI (V2 - Trafilatura Engine)
|
| 3 |
+
Uses the modern trafilatura library to bypass bot-blockers,
|
| 4 |
+
strip out cookie banners, and extract pristine article text for NLP.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
from dataclasses import dataclass
|
| 11 |
+
|
| 12 |
+
import trafilatura
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass
class ScrapedArticle:
    """Article text and metadata returned by ``extract_article``."""

    title: str  # headline; extract_article substitutes "Unknown Title" when none is found
    text: str  # extracted main body text of the page
    authors: list[str]  # individual author names; empty list when no author metadata is found
    publish_date: str | None  # value of the extractor's 'date' field, or None when absent
    source_url: str  # the URL that was passed to extract_article
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def extract_article(url: str) -> ScrapedArticle:
    """Download and parse a news article from *url* using Trafilatura.

    Args:
        url: Address of the article page to fetch.

    Returns:
        A ``ScrapedArticle`` with the title, body text, author list,
        publish date (may be ``None``) and the original URL.

    Raises:
        ValueError: If the page cannot be fetched, or if fewer than 50
            characters of text (after stripping) can be extracted.
    """
    logger.info("Attempting to scrape URL: %s", url)

    # 1. Fetch the raw HTML; fetch_url returns None when the download fails.
    downloaded = trafilatura.fetch_url(url)
    if downloaded is None:
        logger.error("Fetch failed for %s. The site may be down or actively blocking bots.", url)
        raise ValueError("Could not access URL. The site may be blocking automated requests or is invalid.")

    # 2. Extract text and metadata. Comments and tables are disabled to keep
    #    the text as clean as possible for the downstream model.
    extracted = trafilatura.bare_extraction(
        downloaded,
        include_comments=False,
        include_tables=False,
    )

    # Older trafilatura releases return a plain dict here, newer ones return a
    # Document object. Normalise to a dict so the lookups below work with
    # whichever version the unpinned requirement resolves to.
    if extracted is not None and not isinstance(extracted, dict):
        extracted = extracted.as_dict()

    # 3. Guardrail: did we actually get a usable amount of text?
    text = (extracted or {}).get('text') or ''
    if len(text.strip()) < 50:
        logger.warning("Extraction failed or returned too little text for %s", url)
        raise ValueError(
            "Extracted article content is too short or empty. "
            "The URL may be a video, a paywalled article, or heavily obfuscated with JavaScript."
        )

    # 4. Clean up the metadata.
    title = extracted.get('title') or "Unknown Title"
    date = extracted.get('date')

    # The author field is a single string; multiple names are separated by
    # semicolons or commas, so normalise the separator and split.
    raw_author = extracted.get('author')
    if raw_author:
        authors = [a.strip() for a in raw_author.replace(';', ',').split(',') if a.strip()]
    else:
        authors = []

    logger.info("Successfully scraped: '%s...' (%d characters)", title[:30], len(text))

    return ScrapedArticle(
        title=title,
        text=text,
        authors=authors,
        publish_date=date,
        source_url=url,
    )
|
trained_model_v2/config.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "distilbert-base-uncased",
|
| 3 |
+
"activation": "gelu",
|
| 4 |
+
"architectures": [
|
| 5 |
+
"DistilBertForSequenceClassification"
|
| 6 |
+
],
|
| 7 |
+
"attention_dropout": 0.1,
|
| 8 |
+
"dim": 768,
|
| 9 |
+
"dropout": 0.1,
|
| 10 |
+
"hidden_dim": 3072,
|
| 11 |
+
"id2label": {
|
| 12 |
+
"0": "FAKE",
|
| 13 |
+
"1": "REAL"
|
| 14 |
+
},
|
| 15 |
+
"initializer_range": 0.02,
|
| 16 |
+
"label2id": {
|
| 17 |
+
"FAKE": 0,
|
| 18 |
+
"REAL": 1
|
| 19 |
+
},
|
| 20 |
+
"max_position_embeddings": 512,
|
| 21 |
+
"model_type": "distilbert",
|
| 22 |
+
"n_heads": 12,
|
| 23 |
+
"n_layers": 6,
|
| 24 |
+
"pad_token_id": 0,
|
| 25 |
+
"problem_type": "single_label_classification",
|
| 26 |
+
"qa_dropout": 0.1,
|
| 27 |
+
"seq_classif_dropout": 0.2,
|
| 28 |
+
"sinusoidal_pos_embds": false,
|
| 29 |
+
"tie_weights_": true,
|
| 30 |
+
"torch_dtype": "float32",
|
| 31 |
+
"transformers_version": "4.44.2",
|
| 32 |
+
"vocab_size": 30522
|
| 33 |
+
}
|
trained_model_v2/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:88d5997db34a6989bc93d791e3f16f0e8a330b449f3cab3bc064057bd9e1e2d3
|
| 3 |
+
size 267832560
|
trained_model_v2/special_tokens_map.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": "[CLS]",
|
| 3 |
+
"mask_token": "[MASK]",
|
| 4 |
+
"pad_token": "[PAD]",
|
| 5 |
+
"sep_token": "[SEP]",
|
| 6 |
+
"unk_token": "[UNK]"
|
| 7 |
+
}
|
trained_model_v2/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
trained_model_v2/tokenizer_config.json
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "[PAD]",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"100": {
|
| 12 |
+
"content": "[UNK]",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"101": {
|
| 20 |
+
"content": "[CLS]",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"102": {
|
| 28 |
+
"content": "[SEP]",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"103": {
|
| 36 |
+
"content": "[MASK]",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"clean_up_tokenization_spaces": true,
|
| 45 |
+
"cls_token": "[CLS]",
|
| 46 |
+
"do_lower_case": true,
|
| 47 |
+
"mask_token": "[MASK]",
|
| 48 |
+
"model_max_length": 512,
|
| 49 |
+
"pad_token": "[PAD]",
|
| 50 |
+
"sep_token": "[SEP]",
|
| 51 |
+
"strip_accents": null,
|
| 52 |
+
"tokenize_chinese_chars": true,
|
| 53 |
+
"tokenizer_class": "DistilBertTokenizer",
|
| 54 |
+
"unk_token": "[UNK]"
|
| 55 |
+
}
|
trained_model_v2/vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
verifier.py
ADDED
|
@@ -0,0 +1,422 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Internet Verifier for VeriLens AI
|
| 3 |
+
- Searches the web via Google News RSS for live, rate-limit-proof verification.
|
| 4 |
+
- Searches Wikipedia API for historical fact verification.
|
| 5 |
+
- Computes strict semantic entailment using a Cross-Encoder.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
import urllib.request
|
| 10 |
+
import urllib.parse
|
| 11 |
+
import xml.etree.ElementTree as ET
|
| 12 |
+
import re
|
| 13 |
+
import json # <-- Added for Wikipedia API
|
| 14 |
+
import numpy as np # <-- Added for softmax over NLI logits
|
| 15 |
+
|
| 16 |
+
import asyncio
|
| 17 |
+
import logging
|
| 18 |
+
from dataclasses import dataclass, field
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
# ── Lazy-loaded Cross-Encoder ────────────────────────────────────────
|
| 23 |
+
_cross_model = None


def _get_cross_model():
    """Return the shared NLI Cross-Encoder, loading it on first use.

    The model is cached in the module-level ``_cross_model``. If loading
    fails for any reason the failure is logged as a warning and ``None`` is
    returned; nothing is cached in that case, so the next call tries again.
    """
    global _cross_model

    if _cross_model is not None:
        return _cross_model

    try:
        # Imported lazily so the module can be imported without the
        # sentence-transformers package being loaded up front.
        from sentence_transformers import CrossEncoder

        logger.info("Loading Multilingual NLI Cross-Encoder model…")
        # Multilingual mDeBERTa NLI model.
        # Label order: [entailment=0, neutral=1, contradiction=2]
        _cross_model = CrossEncoder("MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7")
        logger.info("Multilingual NLI Cross-Encoder loaded successfully.")
    except Exception as exc:
        logger.warning("Could not load NLI Cross-Encoder: %s", exc)

    return _cross_model
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@dataclass
class SourceArticle:
    """One search hit that is considered as evidence for a claim."""

    title: str  # headline of the search hit
    url: str  # link to the search hit
    snippet: str  # short text excerpt returned by the search backend
    trust: str = "medium"  # one of "high", "medium", "low"
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@dataclass
class VerificationResult:
    """Outcome of ``verify_claim``."""

    similarity_score: float = 0.0  # mean entailment score of the accepted sources; 0.0 when there are none
    sources: list[SourceArticle] = field(default_factory=list)  # sources that passed the relevance threshold
    verified: bool = False  # verify_claim sets this to False only when the search returned no items at all
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# ── Trusted domains (Expanded Global & Indian Scope) ───────────────────────
|
| 56 |
+
HIGH_TRUST_DOMAINS = {
    "wikipedia.org",  # Wikipedia as a ground-truth source

    # Global wire services (the original sources of most news)
    "reuters.com", "apnews.com", "bloomberg.com", "afp.com", "upi.com",

    # Major US, UK & international media
    "bbc.com", "bbc.co.uk", "nytimes.com", "washingtonpost.com", "wsj.com",
    "theguardian.com", "npr.org", "pbs.org", "cnn.com", "ft.com",
    "aljazeera.com", "dw.com", "france24.com", "scmp.com", "nbcnews.com",
    "cbsnews.com", "abcnews.go.com", "theatlantic.com", "time.com", "economist.com",

    # Indian national & regional heavyweights
    "thehindu.com", "hindustantimes.com", "indianexpress.com", "timesofindia.indiatimes.com",
    "ndtv.com", "indiatoday.in", "theprint.in", "thewire.in", "scroll.in",
    "livemint.com", "business-standard.com", "deccanherald.com", "telegraphindia.com",
    "tribuneindia.com", "newindianexpress.com", "firstpost.com", "thequint.com",
    "cnbctv18.com", "moneycontrol.com", "aninews.in", "ptinews.com", "freepressjournal.in",

    # Dedicated fact-checkers
    "snopes.com", "politifact.com", "factcheck.org", "altnews.in", "boomlive.in",
    "newschecker.in", "vishvasnews.com", "smhoaxinvestigator.com", "factchecker.in",

    # High-trust aggregators. An entry may carry a path prefix ("yahoo.com/news").
    # NOTE(review): results taken from the Google News RSS feed presumably all
    # link through news.google.com, which would rate every one of them "high"
    # regardless of the real publisher - confirm and consider resolving the
    # publisher instead.
    "yahoo.com/news", "msn.com", "news.google.com"
}

# ── Low Trust / Disinformation / Satire domains ────────────────────────────
LOW_TRUST_DOMAINS = {
    # Known fake news, pseudoscience & conspiracy
    "infowars.com", "naturalnews.com", "beforeitsnews.com", "thegatewaypundit.com",
    "zerohedge.com", "worldnewsdailyreport.com", "nationalreport.net",

    # State-sponsored propaganda
    "rt.com", "sputniknews.com", "globaltimes.cn",

    # Indian high-bias / frequently flagged for disinformation
    "postcard.news", "opindia.com", "tfipost.com", "kreately.in", "rightlog.in",

    # Satire
    "theonion.com", "babylonbee.com", "fakingnews.com", "thefauxy.com",
    "thedailymash.co.uk", "waterfordwhispersnews.com", "clickhole.com"
}


def _split_host_and_path(url: str) -> tuple[str, str]:
    """Return the lower-cased ``(hostname, path)`` of *url*; ``("", "")`` if unparsable."""
    candidate = url.strip().lower()
    if "://" not in candidate:
        # Without a scheme urlsplit treats everything as a path; the "//"
        # prefix makes it recognise the leading component as the host.
        candidate = "//" + candidate
    try:
        parts = urllib.parse.urlsplit(candidate)
        return parts.hostname or "", parts.path
    except ValueError:
        return "", ""


def _host_in_domains(host: str, path: str, domains) -> bool:
    """Tell whether *host* is one of *domains* or a subdomain of one of them."""
    if not host:
        return False
    for entry in domains:
        domain, _, path_prefix = entry.partition("/")
        if host != domain and not host.endswith("." + domain):
            continue
        if not path_prefix or path.startswith("/" + path_prefix):
            return True
    return False


def _trust_level(url: str, snippet: str = "", title: str = "") -> str:
    """Rate a source as ``"high"``, ``"medium"`` or ``"low"`` trust.

    Domains are compared against the URL's hostname (exact match or
    subdomain), never as raw substrings of the whole URL. A substring test
    would treat ``microsoft.com`` as ``ft.com``, ``walmart.com`` as
    ``rt.com``, and any URL with ``reuters.com`` in its query string as
    Reuters.

    Checks, in order:
      1. hostname in ``HIGH_TRUST_DOMAINS``  -> "high"
      2. hostname in ``LOW_TRUST_DOMAINS``   -> "low"
      3. snippet or title names a major wire service/outlet -> "high"
      4. otherwise                           -> "medium"

    The low-trust check runs before the keyword heuristic so that a satire or
    disinformation site cannot be promoted to "high" merely by mentioning an
    outlet's name in its headline.

    Args:
        url: Link of the search hit; may be empty or lack a scheme.
        snippet: Excerpt text of the hit.
        title: Headline of the hit.
    """
    host, path = _split_host_and_path(url)

    # 1. Known trustworthy publishers.
    if _host_in_domains(host, path, HIGH_TRUST_DOMAINS):
        return "high"

    # 2. Known low-trust/satire sites.
    if _host_in_domains(host, path, LOW_TRUST_DOMAINS):
        return "low"

    # 3. Syndicated content: an unknown domain quoting a major wire service.
    lower_snippet = snippet.lower()
    lower_title = title.lower()
    high_trust_keywords = ["reuters", "associated press", "bbc", "cnn", "the new york times", "bloomberg"]
    for keyword in high_trust_keywords:
        if keyword in lower_snippet or keyword in lower_title:
            return "high"

    return "medium"
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
# ── Locale detection for multilingual search ─────────────────────────────
|
| 127 |
+
_LOCALE_MAP = {
    (0x0900, 0x097F): ('hi', 'IN'),  # Devanagari → Hindi
    (0x0980, 0x09FF): ('bn', 'IN'),  # Bengali
    (0x0A00, 0x0A7F): ('pa', 'IN'),  # Gurmukhi → Punjabi
    (0x0A80, 0x0AFF): ('gu', 'IN'),  # Gujarati
    (0x0B80, 0x0BFF): ('ta', 'IN'),  # Tamil
    (0x0C00, 0x0C7F): ('te', 'IN'),  # Telugu
    (0x0C80, 0x0CFF): ('kn', 'IN'),  # Kannada
    (0x0D00, 0x0D7F): ('ml', 'IN'),  # Malayalam
    (0x0600, 0x06FF): ('ar', 'AE'),  # Arabic
    (0x4E00, 0x9FFF): ('zh', 'CN'),  # CJK → Chinese
    (0x3040, 0x30FF): ('ja', 'JP'),  # Hiragana/Katakana → Japanese
    (0xAC00, 0xD7AF): ('ko', 'KR'),  # Hangul → Korean
    (0x0400, 0x04FF): ('ru', 'RU'),  # Cyrillic → Russian
}


def _detect_locale(query: str) -> tuple[str, str]:
    """Detect ``(lang, country)`` from the Unicode script used in *query*.

    The first character that falls inside one of the ``_LOCALE_MAP`` ranges
    decides the locale. Characters outside every range (ASCII, accented
    Latin, punctuation, ...) are skipped. If no character matches, the
    default ``('en', 'US')`` is returned.

    CJK ideographs are shared by Chinese and Japanese, so an ideograph alone
    is not conclusive. When the first match is an ideograph the scan
    continues, and the result becomes Japanese if any Hiragana/Katakana
    follows; otherwise it stays Chinese.
    """
    cjk_locale = ('zh', 'CN')
    kana_locale = ('ja', 'JP')

    detected = None
    for ch in query:
        cp = ord(ch)
        locale = None
        for (lo, hi), candidate in _LOCALE_MAP.items():
            if lo <= cp <= hi:
                locale = candidate
                break
        if locale is None:
            continue

        if detected is None:
            if locale != cjk_locale:
                return locale  # unambiguous script: first match wins
            detected = locale  # ideograph: keep scanning for kana
        elif locale == kana_locale:
            return locale  # kanji followed by kana → Japanese

    return detected or ('en', 'US')  # default to English
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def _fetch_google_rss(url: str, num_results: int) -> list[dict]:
    """Fetch a Google News RSS URL and parse it into a list of result dicts.

    Args:
        url: Fully built RSS URL to request.
        num_results: Maximum number of ``<item>`` entries to return.

    Returns:
        One dict per item with the keys ``"title"``, ``"href"`` and
        ``"body"`` (the description with HTML tags removed, entities decoded
        and whitespace collapsed). Missing or empty elements yield ``""``.

    Raises:
        Network errors from ``urllib`` and ``xml.etree.ElementTree.ParseError``
        are not handled here and propagate to the caller.
    """
    import html  # local import: only needed for entity decoding below

    def _text_of(element) -> str:
        # An element can be absent (None) or present but empty (.text is None).
        if element is None:
            return ""
        return element.text or ""

    print(f" 🌐 GOOGLE NEWS URL: {url}")
    req = urllib.request.Request(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)'}
    )
    with urllib.request.urlopen(req, timeout=10) as response:
        xml_data = response.read()

    root = ET.fromstring(xml_data)
    results = []
    for item in root.findall('.//item')[:num_results]:
        # The description is an HTML fragment: drop the tags, then decode
        # entities such as &nbsp; and collapse the whitespace left behind.
        stripped = re.sub('<[^<]+>', '', _text_of(item.find('description')))
        snippet = " ".join(html.unescape(stripped).split())
        results.append({
            "title": _text_of(item.find('title')),
            "href": _text_of(item.find('link')),
            "body": snippet,
        })

    print(f" 📰 Results found: {len(results)}")
    return results
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def _google_news_search(query: str, num_results: int = 8) -> list[dict]:
    """Multilingual Google News RSS search with progressive fallbacks.

    Attempts, each made only if the previous one returned nothing:
      1. Search with the locale detected from the query's script.
      2. Search with no locale parameters.
      3. Search with only the first 6 words of the query (skipped when the
         query has 6 words or fewer, because that would repeat attempt 1).

    Args:
        query: Free-text search query.
        num_results: Maximum number of results to return.

    Returns:
        A list of result dicts as produced by ``_fetch_google_rss``. Any
        exception is logged and turned into an empty list, so a failed search
        never aborts the verification pipeline.
    """
    base_url = "https://news.google.com/rss/search?q="

    try:
        safe_query = urllib.parse.quote(query)
        lang, country = _detect_locale(query)
        locale_params = f"&hl={lang}&gl={country}&ceid={country}:{lang}"

        print(f"\n{'='*50}")
        print(f"🔍 GOOGLE NEWS SEARCH")
        print(f" Query: {query[:80]}{'...' if len(query) > 80 else ''}")
        print(f" Detected locale: hl={lang}, gl={country}")

        # Attempt 1: search with the detected locale.
        results = _fetch_google_rss(f"{base_url}{safe_query}{locale_params}", num_results)

        # Attempt 2: no locale params, let Google infer.
        if not results:
            print(" ⚠️ Zero results. Retrying with no locale params...")
            results = _fetch_google_rss(f"{base_url}{safe_query}", num_results)

        # Attempt 3: shorten the query to its first 6 words. Only useful when
        # that actually drops words.
        if not results:
            words = query.split()
            if len(words) > 6:
                short_query = " ".join(words[:6])
                safe_short = urllib.parse.quote(short_query)
                print(f" ⚠️ Still zero. Slicing to 6 words: '{short_query}'")
                results = _fetch_google_rss(f"{base_url}{safe_short}{locale_params}", num_results)

        print(f" ✅ Final result count: {len(results)}")
        print(f"{'='*50}\n")
        return results

    except Exception as exc:
        # Best-effort boundary: network, HTTP and XML errors all end up here.
        logger.error("Google News search failed: %s", exc)
        return []
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def _wikipedia_search(query: str) -> list[dict]:
    """Search Wikipedia for *query* and return at most two hits.

    English Wikipedia is queried first. If it yields nothing and the query
    contains non-ASCII characters, the edition matching the detected script
    is queried instead (unless that detection also says English).

    Returns:
        Result dicts with ``"title"``, ``"href"`` and ``"body"`` keys. Any
        exception is logged and turned into an empty list.
    """
    def _wiki_query(wiki_lang: str, q: str) -> list[dict]:
        # Query the search API of one language edition.
        encoded = urllib.parse.quote(q)
        url = f"https://{wiki_lang}.wikipedia.org/w/api.php?action=query&list=search&srsearch={encoded}&utf8=&format=json"
        print(f" 📚 WIKIPEDIA URL ({wiki_lang}): {url[:120]}...")
        request = urllib.request.Request(
            url,
            headers={'User-Agent': 'VeriLensAI/1.0 (University Fact-Checking Project)'}
        )
        with urllib.request.urlopen(request, timeout=10) as response:
            payload = json.loads(response.read().decode())

        def _to_result(hit: dict) -> dict:
            page_title = hit['title']
            # Search snippets carry HTML highlight markup; strip the tags.
            plain_snippet = re.sub('<[^<]+>', '', hit['snippet'])
            return {
                "title": f"{page_title} - Wikipedia",
                "href": f"https://{wiki_lang}.wikipedia.org/wiki/{urllib.parse.quote(page_title.replace(' ', '_'))}",
                "body": plain_snippet
            }

        hits = payload.get('query', {}).get('search', [])[:2]
        found = [_to_result(hit) for hit in hits]
        print(f" 📚 Wikipedia ({wiki_lang}) results: {len(found)}")
        return found

    try:
        # 1. English Wikipedia first.
        results = _wiki_query('en', query)
        if results:
            return results

        # 2. Nothing found: only a non-ASCII query justifies another edition.
        if not any(ord(c) > 127 for c in query):
            return results

        detected_lang, _ = _detect_locale(query)
        if detected_lang == 'en':
            return results

        logger.info(f"Retrying Wikipedia with lang={detected_lang} for non-ASCII query")
        return _wiki_query(detected_lang, query)
    except Exception as exc:
        logger.error("Wikipedia search failed: %s", exc)
        return []
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
async def _search_web(query: str, num_results: int = 8) -> list[dict]:
    """Search Google News and Wikipedia concurrently and merge the hits.

    Both blocking searches run in worker threads at the same time. If the
    merged list is empty and the query has more than 6 words, the search is
    repeated once with only the first 6 words. A query of 6 words or fewer
    is not retried because the shortened query would be identical.

    Args:
        query: Free-text search query.
        num_results: Maximum number of merged results to return.

    Returns:
        Up to *num_results* result dicts: news hits first, then Wikipedia
        hits, then additional news hits filling any remaining slots.
    """
    async def _search_both(q: str) -> list[dict]:
        news_results, wiki_results = await asyncio.gather(
            asyncio.to_thread(_google_news_search, q, num_results),
            asyncio.to_thread(_wikipedia_search, q),
        )
        return _merge_balanced(news_results, wiki_results, num_results)

    balanced_results = await _search_both(query)

    # Short-query fallback: nothing at all was found for the full query.
    if not balanced_results:
        words = query.split()
        if len(words) > 6:
            short_query = " ".join(words[:6])
            logger.info("Zero results for full query. Retrying with short query: '%s'", short_query)
            balanced_results = await _search_both(short_query)

    return balanced_results


def _merge_balanced(news_results: list[dict], wiki_results: list[dict], num_results: int) -> list[dict]:
    """Merge news and wiki hits, giving each source half of *num_results*."""
    half_quota = num_results // 2
    merged = news_results[:half_quota] + wiki_results[:num_results - half_quota]

    # If Wikipedia did not use up its share, hand the free slots to news.
    remaining_slots = num_results - len(merged)
    if remaining_slots > 0:
        merged.extend(news_results[half_quota:half_quota + remaining_slots])
    return merged
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
# NLI Entailment threshold — much stricter than old STS similarity.
|
| 306 |
+
# Only sources whose articles genuinely ENTAIL the claim will pass.
|
| 307 |
+
MIN_RELEVANCE_THRESHOLD = 0.75
|
| 308 |
+
|
| 309 |
+
# Label mapping for MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7
|
| 310 |
+
# Index 0 = Entailment, Index 1 = Neutral, Index 2 = Contradiction
|
| 311 |
+
_NLI_ENTAILMENT_IDX = 0
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
def _softmax(logits: np.ndarray) -> np.ndarray:
    """Return the softmax of *logits* along the last axis.

    The per-row maximum is subtracted before exponentiating so that large
    logits do not overflow; this leaves the result unchanged.
    """
    row_peak = np.max(logits, axis=-1, keepdims=True)
    weights = np.exp(logits - row_peak)
    return weights / weights.sum(axis=-1, keepdims=True)
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
def _compute_per_source_similarity(text: str, snippets: list[str]) -> list[float]:
    """Score how strongly each snippet entails the claim *text*.

    Each (snippet, claim) pair is fed to the NLI Cross-Encoder, which yields
    three raw logits per pair in the order [Entailment, Neutral,
    Contradiction]. The logits are turned into probabilities with a softmax
    and the entailment probability (column ``_NLI_ENTAILMENT_IDX``) is
    returned, so keyword overlap alone does not produce a high score.

    Args:
        text: The claim being verified (used as the NLI hypothesis).
        snippets: Candidate source texts (each used as an NLI premise).

    Returns:
        One probability in [0.0, 1.0] per snippet, in input order. If the
        model is unavailable, *snippets* is empty, or inference fails, every
        score is ``0.0``.
    """
    model = _get_cross_model()
    if model is None or not snippets:
        return [0.0] * len(snippets)

    try:
        # Cross-Encoders take PAIRS: (premise=article, hypothesis=claim).
        # Both sides are cut to 512 characters to bound the input size.
        claim = text[:512]
        pairs = [[snippet[:512], claim] for snippet in snippets]

        # NLI models return raw logits of shape (N, 3).
        logits = np.array(model.predict(pairs))

        # Ensure 2-D even for a single pair.
        if logits.ndim == 1:
            logits = logits.reshape(1, -1)

        # Softmax → probabilities, then take the entailment column.
        entailment_scores = _softmax(logits)[:, _NLI_ENTAILMENT_IDX]
        return [float(score) for score in entailment_scores]
    except Exception as exc:
        logger.error("NLI entailment computation failed: %s", exc)
        return [0.0] * len(snippets)
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
async def verify_claim(text: str, search_query: str) -> VerificationResult:
    """Look up *search_query* online and keep the sources that entail *text*.

    Every search hit is scored with the NLI Cross-Encoder against the claim.
    Wikipedia hits are accepted from a score of 0.45, all other hits from
    ``MIN_RELEVANCE_THRESHOLD``. The result carries the accepted sources and
    the mean of their scores (rounded to 4 decimals).

    Returns:
        ``verified=False`` with no sources when the search returned nothing;
        otherwise ``verified=True``, including the case where hits were
        found but none passed its threshold.
    """
    items = await _search_web(search_query)
    if not items:
        return VerificationResult(similarity_score=0.0, sources=[], verified=False)

    def _to_candidate(item: dict) -> SourceArticle:
        headline = item.get("title", "")
        link = item.get("url", "") or item.get("href", "")
        excerpt = item.get("body", "")
        return SourceArticle(
            title=headline,
            url=link,
            snippet=excerpt,
            trust=_trust_level(url=link, snippet=excerpt, title=headline),
        )

    # Every returned item is processed (no slicing), so Wikipedia hits at the
    # end of the list are scored too.
    candidates: list[SourceArticle] = [_to_candidate(item) for item in items]
    snippets: list[str] = [f"{c.title}. {c.snippet}" for c in candidates]

    # Model inference is blocking, so run it off the event loop.
    scores = await asyncio.to_thread(_compute_per_source_similarity, text, snippets)

    sources: list[SourceArticle] = []
    relevant_scores: list[float] = []

    # Debug trace of every score and decision on the backend terminal.
    print("\n" + "="*50)
    print("🧠 CROSS-ENCODER SCORES:")

    for candidate, score in zip(candidates, scores):
        print(f"Score: {score:.3f} | Source: {candidate.url}")

        # Encyclopedic context gets a lower bar than news, which must meet
        # the strict entailment threshold.
        required_score = 0.45 if "wikipedia.org" in candidate.url else MIN_RELEVANCE_THRESHOLD

        if score >= required_score:
            sources.append(candidate)
            relevant_scores.append(score)
            print(f" -> ✅ ACCEPTED (Requires >= {required_score})")
        else:
            print(f" -> ❌ REJECTED (Requires >= {required_score})")

    print("="*50 + "\n")

    if not sources:
        # NOTE(review): verified=True although no source passed the threshold.
        # Presumably "verified" means "a search was carried out" rather than
        # "the claim was confirmed" - confirm against the decision engine.
        return VerificationResult(similarity_score=0.0, sources=[], verified=True)

    mean_score = sum(relevant_scores) / len(relevant_scores)
    return VerificationResult(
        similarity_score=round(mean_score, 4),
        sources=sources,
        verified=True,
    )
|