nahArnav committed on
Commit
39bbca0
·
verified ·
1 Parent(s): f791e77

Upload 13 files

Browse files
decision_engine.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Decision Engine for VeriLens AI
3
+ Combines ML prediction, verification similarity, source credibility,
4
+ and NLP analysis into a final verdict.
5
+ """
6
+
7
+ from __future__ import annotations
8
+ from dataclasses import dataclass, field
9
+
10
@dataclass
class Decision:
    """Final verdict returned by :func:`make_decision`."""

    prediction: str  # "REAL", "FAKE", or "UNCERTAIN"
    confidence: int  # 0 – 100
    explanation: str
    factors: dict = field(default_factory=dict)


def _clamp_unit(value: float) -> float:
    """Clamp a probability-like value into the closed interval [0.0, 1.0]."""
    return max(0.0, min(1.0, float(value)))


def make_decision(
    ml_label: str,
    ml_confidence: float,
    similarity_score: float,
    sources_verified: bool,
    suspicious_info: dict,
    high_trust_count: int = 0,
    low_trust_count: int = 0,
) -> Decision:
    """Combine the ML, verification, credibility and language signals.

    Each signal contributes a bounded number of points (45 + 25 + 15 + 15);
    the 0-100 total is mapped to REAL (>= 65), FAKE (<= 40) or UNCERTAIN.

    Args:
        ml_label: Classifier label; matched case-insensitively against
            "FAKE" / "REAL". Anything else is treated as inconclusive.
        ml_confidence: Classifier confidence, clamped to [0, 1].
        similarity_score: Verification similarity, clamped to [0, 1].
        sources_verified: Whether online verification produced a result.
        suspicious_info: Mapping that may hold "total_suspicious_count";
            ``None`` is treated as an empty mapping.
        high_trust_count: Number of high-trust sources.
        low_trust_count: Number of low-trust sources.

    Returns:
        A :class:`Decision` with verdict, confidence, explanation and the
        per-signal score breakdown.
    """
    label = (ml_label or "").strip().upper()
    # Clamp so an out-of-range input cannot push one factor past its budget.
    ml_confidence = _clamp_unit(ml_confidence)
    similarity_score = _clamp_unit(similarity_score)

    # ── ML score contribution (0-45) ────────────────────────────────────────
    if label == "FAKE":
        ml_score = (1 - ml_confidence) * 45
    elif label == "REAL":
        ml_score = ml_confidence * 45
    else:
        ml_score = 22.5

    # ── Verification score contribution (0-25) ──────────────────────────────
    # Without verification the signal is neutral (half of its budget).
    verify_score = similarity_score * 25 if sources_verified else 12.5

    # ── Source credibility contribution (0-15) ──────────────────────────────
    rated_sources = high_trust_count + low_trust_count
    if rated_sources > 0:
        cred_score = high_trust_count / rated_sources * 15
    else:
        # No high/low rated sources: neutral, verified or not.
        cred_score = 7.5

    # ── Suspicious language penalty (0-15) ──────────────────────────────────
    sus_count = (suspicious_info or {}).get("total_suspicious_count", 0) or 0
    if sus_count == 0:
        sus_score = 15
    elif sus_count <= 2:
        sus_score = 10
    elif sus_count <= 5:
        sus_score = 5
    else:
        sus_score = 0

    # ── Aggregate ───────────────────────────────────────────────────────────
    total = ml_score + verify_score + cred_score + sus_score
    total = max(0, min(100, total))

    # ── Guard: prevent FAKE ML prediction from flipping to Real ─────────────
    ml_fake_overridden = False
    if label == "FAKE" and ml_confidence >= 0.6 and total >= 65:
        total = 55
        ml_fake_overridden = True

    # ── Decide verdict ──────────────────────────────────────────────────────
    if total >= 65:
        prediction = "REAL"
    elif total <= 40:
        prediction = "FAKE"
    else:
        prediction = "UNCERTAIN"

    # ── Confidence relative to the prediction ───────────────────────────────
    if prediction == "REAL":
        confidence = int(round(total))
    elif prediction == "FAKE":
        confidence = 100 - int(round(total))
    else:
        # Highest at the centre of the uncertain band (52.5), bounded 30..50.
        distance = abs(total - 52.5)
        confidence = max(30, min(50, int(round(50 - distance))))

    # ── Build explanation ───────────────────────────────────────────────────
    explanations: list[str] = []

    if label == "FAKE":
        explanations.append(f"The AI model classified this as FAKE with {ml_confidence:.0%} confidence.")
        if ml_fake_overridden:
            explanations.append("Although related articles exist online, they may be debunking the claim rather than confirming it.")
    elif label == "REAL":
        explanations.append(f"The AI model classified this as REAL with {ml_confidence:.0%} confidence.")
    else:
        explanations.append("The AI model could not reach a strong conclusion.")

    if sources_verified:
        if similarity_score > 0.6:
            explanations.append("The claim is well-corroborated by multiple online sources.")
        elif similarity_score > 0.3:
            explanations.append("Some related articles were found, but corroboration is partial.")
        else:
            explanations.append("Very few matching sources were found online.")
    else:
        explanations.append("Internet verification was not available; the verdict relies on AI analysis.")

    if sus_count > 3:
        explanations.append("High levels of suspicious, sensationalist, or emotional language detected.")
    elif sus_count > 0:
        explanations.append("Minor suspicious language patterns were noted.")

    factors = {
        "ml_score": round(ml_score, 2),
        "verification_score": round(verify_score, 2),
        "credibility_score": round(cred_score, 2),
        "language_score": round(sus_score, 2),
    }

    return Decision(
        prediction=prediction,
        confidence=confidence,
        explanation=" ".join(explanations),
        factors=factors,
    )
main.py ADDED
@@ -0,0 +1,766 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VeriLens AI – FastAPI Backend
3
+ Main application entry point.
4
+ """
5
+
6
+ from __future__ import annotations
7
+ import hashlib
8
+ import logging
9
+ import re
10
+ import time
11
+ from contextlib import asynccontextmanager
12
+ from datetime import datetime, timedelta
13
+ import random
14
+
15
+ from typing import Literal, Optional
16
+
17
+ from fastapi import FastAPI, HTTPException
18
+ from fastapi.middleware.cors import CORSMiddleware
19
+ from pydantic import BaseModel, Field
20
+
21
+ from model import classify, load_model
22
+ from nlp_utils import build_search_query, detect_language, detect_suspicious_phrases, extract_keywords
23
+ from scraper import extract_article
24
+ from verifier import verify_claim
25
+ from decision_engine import make_decision
26
+
27
+ # ── Logging ─────────────────────────────────────────────────────────────────
28
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)-7s | %(name)s | %(message)s")
29
+ logger = logging.getLogger("verilens")
30
+
31
+ URL_PATTERN = re.compile(r"^https?://(?:[a-zA-Z0-9\-._~:/?#\[\]@!$&'()*+,;=%])+")
32
+
33
+ def _is_url(text: str) -> bool:
34
+ return bool(URL_PATTERN.match(text.strip()))
35
+
36
+ # ── Lifespan ────────────────────────────────────────────────────────────────
37
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: start model loading, then log shutdown.

    ``load_model`` runs on a daemon thread so startup does not wait for it.
    """
    import threading

    logger.info("Starting VeriLens AI backend …")
    loader = threading.Thread(target=load_model, daemon=True)
    loader.start()
    yield
    logger.info("Shutting down VeriLens AI backend.")
44
+
45
+ # ── FastAPI app ─────────────────────────────────────────────────────────────
46
+ app = FastAPI(title="VeriLens AI", description="Hybrid Fake News Detection System", version="1.0.0", lifespan=lifespan)
47
+
48
@app.get("/")
def health_check():
    """Root endpoint returning a static liveness message."""
    payload = {"status": "Truth Bureau Backend is Alive and Running"}
    return payload
51
+
52
# Wildcard origins cannot be combined with credentialed requests: FastAPI's
# CORS documentation requires explicit origins/methods/headers whenever
# allow_credentials is True. Nothing visible here relies on cookies or
# auth headers, so credentials are disabled rather than guessing an origin
# list. NOTE(review): if the frontend needs credentials, list its origins
# explicitly and re-enable allow_credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
59
+
60
+ # ── Schemas ──────────────────────────────────────────────────────────────
61
class AnalyzeRequest(BaseModel):
    # Raw user submission: the /analyze endpoint treats it as an article URL
    # when it looks like one, otherwise as free-form claim text.
    input: str
63
+
64
class SourceOut(BaseModel):
    # One verification source as returned to the client.
    title: str
    url: str
    snippet: str
    # Trust tier; the rest of this module compares it to "high"/"medium"/"low".
    trust: str
69
+
70
+ # ── NEW: Origin & Mutation Map schemas ───────────────────────────────────
71
class OriginNode(BaseModel):
    """A node on the Origin & Mutation Map (newspaper clipping)."""
    id: str
    # "hostile_actor" | "amplifier" | "current_claim"
    node_type: str
    # "FORUM POST", "SOCIAL MEDIA", "MAJOR NEWS OUTLET", etc.
    source_type: str
    # "ANON_USER44", "@HEALTHGURU_99", outlet name
    author: str
    # ISO-ish date string
    timestamp: str
    # The text on the clipping
    snippet: str
    # Link to examine source
    url: str
80
+
81
class MutationConnection(BaseModel):
    """A dotted line between two nodes with an NLI badge."""
    # id of source node
    from_node: str
    # id of target node
    to_node: str
    # "ENTAILMENT" | "CONTRADICTION"
    nli_label: str
    # percentage, e.g. 98
    nli_score: int
87
+
88
class GroundTruthItem(BaseModel):
    """One item in the evidence analysis list."""
    # 1-based position in the evidence list
    index: int
    text: str
    # "UNVERIFIED" | "CONTRADICTION" | "FALLACY" | "CORROBORATED"
    badge: str
93
+
94
class GroundTruthData(BaseModel):
    """The Established Fact + Evidence Analysis panel."""
    # The corrective summary
    established_fact: str
    evidence_items: list[GroundTruthItem]
98
+
99
class OriginMapData(BaseModel):
    # Clippings shown on the map and the NLI-labelled links between them.
    nodes: list[OriginNode]
    connections: list[MutationConnection]
102
+
103
+ # ── NEW: Frontend-compatible schemas (matches React sampleAnalysis) ──────
104
class FrontendAnnotation(BaseModel):
    # Annotation category attached to a text segment.
    type: Literal["contradiction", "fallacy", "unverified", "verified"]
    # Human-readable note shown with the annotation.
    note: str
107
+
108
class FrontendSegment(BaseModel):
    # One slice of the claim text, optionally flagged and annotated.
    text: str
    isSuspicious: bool
    annotation: Optional[FrontendAnnotation] = None
112
+
113
class FrontendEvidenceNode(BaseModel):
    # A card on the frontend Evidence Board.
    id: str
    role: Literal["hostile", "amplifier", "current"]
    type: str
    date: str
    author: str
    content: str
    # Board placement: position plus tilt of the card.
    x: float
    y: float
    rotation: float
    url: Optional[str] = None
124
+
125
class FrontendConnection(BaseModel):
    # "from" is a Python keyword, so the field is named from_field and
    # exposed under the alias "from".
    from_field: str = Field(alias="from", serialization_alias="from")
    to: str
    # {"type": "contradiction" | "entailment", "score": int}
    nli: dict

    # Allow construction with either the field name or the alias.
    model_config = {"populate_by_name": True}
131
+
132
class AnalyzeResponse(BaseModel):
    # ── Core analysis result ─────────────────────────────────────────────
    input_type: str
    prediction: str
    confidence: int
    explanation: str
    sources: list[SourceOut]
    language: str
    keywords: list[str]
    suspicious: dict
    factors: dict
    elapsed_ms: int

    # ── Figma dashboard fields ───────────────────────────────────────────
    # "FABRICATED" | "VERIFIED" | "UNDER REVIEW"
    verdict_label: str
    # e.g. "TB-006753"
    case_number: str
    # structured node + connection data
    origin_map: OriginMapData
    # established fact + evidence items
    ground_truth: GroundTruthData

    # ── Frontend-compatible fields (React components) ────────────────────
    claim: str
    verdict: Literal["VERIFIED", "FABRICATED", "INCONCLUSIVE"]
    segments: list[FrontendSegment]
    sourceTree: list[FrontendEvidenceNode]
    connections: list[FrontendConnection]
    # Dynamic established fact string for the UI
    groundTruth: str
    # Detailed analytical breakdown of the confidence score
    confidenceExplanation: str
156
+
157
+
158
+ # ── Helpers: build supplementary data from existing signals ──────────────
159
+ _VERDICT_MAP = {"Fake": "FABRICATED", "Real": "VERIFIED", "Uncertain": "UNDER REVIEW"}
160
+ _FRONTEND_VERDICT_MAP = {"Fake": "FABRICATED", "Real": "VERIFIED", "Uncertain": "INCONCLUSIVE"}
161
+
162
+ _NODE_AUTHORS = ["ANON_USER44", "@HEALTHGURU_99", "@NEWS_WATCHER", "@VIRAL_POST",
163
+ "UNKNOWN_SOURCE", "@FACTCHECK_BOT", "@INFO_SPREADER"]
164
+
165
+ _NODE_TYPES_HOSTILE = ["FORUM POST", "ANONYMOUS TIP", "CHAN BOARD", "DARK WEB POST"]
166
+ _NODE_TYPES_AMP = ["SOCIAL MEDIA", "BLOG", "REPOST", "VIRAL TWEET"]
167
+
168
+ def _generate_case_number(text: str) -> str:
169
+ """Deterministic case number from input hash."""
170
+ h = hashlib.md5(text.encode()).hexdigest()
171
+ num = int(h[:6], 16) % 999999
172
+ return f"TB-{num:06d}"
173
+
174
def _build_origin_map(sources: list, verification_score: float, text: str) -> OriginMapData:
    """Build the Origin & Mutation Map from the verification sources.

    Up to four sources become nodes, typed by trust level ("low" →
    hostile_actor, "medium" → amplifier, anything else → current_claim).
    Consecutive nodes are linked with an NLI label/score derived from
    *verification_score* plus a pseudo-random jitter. Node authors, source
    types and timestamps for low/medium-trust nodes are generated
    placeholders, not data taken from the source.

    Args:
        sources: Objects exposing ``trust``, ``title``, ``snippet``, ``url``.
        verification_score: Overall similarity score, expected in 0..1.
        text: The claim text; also seeds the pseudo-random generator.

    Returns:
        The map; with no sources it holds only the submitted-claim node.
    """
    now = datetime.now()
    # Seed with the text itself. random.Random derives the seed from a str
    # deterministically, whereas hash(str) is salted per process and so gave
    # a different map for the same claim after every restart.
    rng = random.Random(text)

    if not sources:
        # Even with no sources, show the current claim node.
        claim_node = OriginNode(
            id="claim_0",
            node_type="current_claim",
            source_type="SUBMITTED CLAIM",
            author="USER SUBMISSION",
            timestamp=now.strftime("%Y-%m-%d %H:%M"),
            snippet=text[:120] + ("…" if len(text) > 120 else ""),
            url="",
        )
        return OriginMapData(nodes=[claim_node], connections=[])

    shown = sources[:4]  # max 4 nodes on the map
    nodes: list[OriginNode] = []
    for i, src in enumerate(shown):
        if src.trust == "low":
            ntype = "hostile_actor"
            stype = rng.choice(_NODE_TYPES_HOSTILE)
            author = rng.choice(_NODE_AUTHORS[:3])
        elif src.trust == "medium":
            ntype = "amplifier"
            stype = rng.choice(_NODE_TYPES_AMP)
            author = rng.choice(_NODE_AUTHORS[3:])
        else:
            ntype = "current_claim"
            stype = "MAJOR NEWS OUTLET"
            # Use the part after the last " - " as the outlet name, if any.
            author = src.title.split(" - ")[-1] if " - " in src.title else src.title[:30]

        days_ago = rng.randint(1, 14)
        hours = rng.randint(0, 23)
        minutes = rng.randint(0, 59)
        ts = (now - timedelta(days=days_ago)).replace(hour=hours, minute=minutes)

        nodes.append(OriginNode(
            id=f"node_{i}",
            node_type=ntype,
            source_type=stype,
            author=author,
            timestamp=ts.strftime("%Y-%m-%d %H:%M"),
            snippet=src.snippet[:150] if src.snippet else src.title,
            url=src.url,
        ))

    connections: list[MutationConnection] = []
    # A falsy score (None or 0.0) falls back to a neutral 50.
    score_base = int(verification_score * 100) if verification_score else 50
    for src, origin, target in zip(shown, nodes, nodes[1:]):
        jitter = rng.randint(-15, 15)
        nli_score = max(10, min(99, score_base + jitter))

        if src.trust == "low":
            nli_label = "CONTRADICTION"
            nli_score = max(70, nli_score)  # hostile actors get high contradiction
        elif nli_score >= 60:
            nli_label = "ENTAILMENT"
        else:
            nli_label = "CONTRADICTION"

        connections.append(MutationConnection(
            from_node=origin.id,
            to_node=target.id,
            nli_label=nli_label,
            nli_score=nli_score,
        ))

    return OriginMapData(nodes=nodes, connections=connections)
255
+
256
+
257
def _build_ground_truth(
    prediction: str,
    explanation: str,
    suspicious: dict,
    keywords: list[str],
    sources: list,
) -> GroundTruthData:
    """Assemble the established-fact summary and the evidence list.

    The summary wording depends on *prediction* ("Fake", "Real", anything
    else) and embeds *explanation*. Evidence items come from at most two
    phrases per suspicious category plus one item each for high- and
    low-trust sources. *keywords* is accepted but not used here.
    """
    source_total = len(sources)
    if prediction == "Fake":
        established_fact = (
            f"Based on cross-referencing {source_total} sources and NLI entailment analysis, "
            f"this claim could not be substantiated. {explanation}"
        )
    elif prediction == "Real":
        established_fact = (
            f"This claim has been corroborated by {source_total} independent sources. {explanation}"
        )
    else:
        established_fact = (
            f"Verification produced mixed results across {source_total} sources. {explanation}"
        )

    # Collect (text, badge) pairs first; indices are assigned at the end.
    findings: list[tuple[str, str]] = []
    findings.extend(
        (f'Clickbait language detected: "{phrase}"', "FALLACY")
        for phrase in suspicious.get("clickbait_phrases", [])[:2]
    )
    findings.extend(
        (f'Emotional manipulation: "{phrase}"', "FALLACY")
        for phrase in suspicious.get("emotional_language", [])[:2]
    )
    findings.extend(
        (f'Unsupported attribution: "{phrase}"', "UNVERIFIED")
        for phrase in suspicious.get("unsupported_claims", [])[:2]
    )

    trusted = [s for s in sources if s.trust == "high"]
    untrusted = [s for s in sources if s.trust == "low"]
    if trusted:
        findings.append((
            f"Corroborated by {len(trusted)} high-trust source(s): {trusted[0].title[:60]}",
            "CORROBORATED",
        ))
    if untrusted:
        findings.append((
            f"Found in {len(untrusted)} low-trust source(s) — possible disinformation origin",
            "CONTRADICTION",
        ))

    if not findings:
        findings.append(("No specific evidence markers detected in the text", "UNVERIFIED"))

    items = [
        GroundTruthItem(index=position, text=item_text, badge=badge)
        for position, (item_text, badge) in enumerate(findings, start=1)
    ]
    return GroundTruthData(established_fact=established_fact, evidence_items=items)
329
+
330
+
331
+ # ── Helpers: build frontend-compatible structures ────────────────────────
332
+
333
+ # Layout presets for source nodes: (x, y, rotation) — diverse spread
334
+ _SOURCE_LAYOUT_WIKI = (80.0, 20.0, -1) # Top-right for Wikipedia
335
+ _SOURCE_LAYOUT_NEWS = [
336
+ (20.0, 30.0, -2),
337
+ (50.0, 80.0, 3),
338
+ (15.0, 60.0, 1),
339
+ (60.0, 45.0, -3),
340
+ ]
341
+
342
+
343
def _build_direct_source_tree(
    text: str,
    sources: list,
    verification_score: float,
    per_source_scores: list[float] | None = None,
) -> tuple[list[FrontendEvidenceNode], list[FrontendConnection]]:
    """Build the Evidence Board nodes and connections from the sources.

    The first node is always the submitted claim ("claim_0"). Up to four
    source nodes follow: at most one Wikipedia source first, then up to
    three non-Wikipedia sources, topped up with further Wikipedia sources
    when fewer than three were selected. Each source node is connected to
    the claim with an NLI type and percentage.

    Args:
        text: Claim text; also seeds the pseudo-random date generator.
        sources: Objects exposing ``url``, ``trust``, ``title``, ``snippet``.
        verification_score: Overall similarity, used when a source has no
            individual score.
        per_source_scores: Optional scores aligned with *sources*; ignored
            unless its length matches.

    Returns:
        ``(sourceTree, connections)``.
    """
    now = datetime.now()
    # Seed with the text itself: hash(str) is salted per process, so it did
    # not give the same board for the same claim across restarts.
    rng = random.Random(text)
    conns: list[FrontendConnection] = []

    # ── Node 1: the claim (always present) ───────────────────────────────
    nodes: list[FrontendEvidenceNode] = [
        FrontendEvidenceNode(
            id="claim_0",
            role="current",
            type="User Submission",
            date=now.strftime("%Y-%m-%d %H:%M"),
            author="SUBMITTED CLAIM",
            content=text[:150] + ("…" if len(text) > 150 else ""),
            x=50.0,
            y=75.0,
            rotation=2,
        )
    ]

    if not sources:
        return nodes, conns

    # ── Separate Wikipedia (historical) from news sources ────────────────
    wiki_sources = [s for s in sources if "wikipedia.org" in s.url]
    news_sources = [s for s in sources if "wikipedia.org" not in s.url]

    # Entries are (source, x, y, rotation, type_label).
    ordered: list[tuple] = []

    for ws in wiki_sources[:1]:
        x, y, rot = _SOURCE_LAYOUT_WIKI
        ordered.append((ws, x, y, rot, "Historical Archive"))

    slot = 0
    for ns in news_sources[:3]:
        x, y, rot = _SOURCE_LAYOUT_NEWS[slot % len(_SOURCE_LAYOUT_NEWS)]
        ordered.append((ns, x, y, rot, "News Article"))
        slot += 1

    # Fewer than three selected: fill the gap with remaining Wikipedia pages.
    if len(ordered) < 3:
        for ws in wiki_sources[1:3 - len(ordered) + 1]:
            x, y, rot = _SOURCE_LAYOUT_NEWS[slot % len(_SOURCE_LAYOUT_NEWS)]
            ordered.append((ws, x, y, rot, "Historical Archive"))
            slot += 1

    # ── Per-source score lookup, keyed by URL ────────────────────────────
    source_score_map: dict[str, float] = {}
    if per_source_scores and len(per_source_scores) == len(sources):
        for s, sc in zip(sources, per_source_scores):
            source_score_map[s.url] = sc

    for i, (src, x, y, rot, type_label) in enumerate(ordered[:4]):
        role = "hostile" if src.trust == "low" else "amplifier"

        # Pick a readable author name.
        if " - " in src.title:
            author = src.title.split(" - ")[-1].strip()[:30]
        elif "wikipedia.org" in src.url:
            author = "WIKIPEDIA"
        else:
            author = src.title[:30] if src.title else "Unknown Source"

        days_ago = rng.randint(1, 14)
        ts = (now - timedelta(days=days_ago)).strftime("%Y-%m-%d %H:%M")
        node_id = f"source_{i + 1}"

        nodes.append(FrontendEvidenceNode(
            id=node_id,
            role=role,
            type=type_label,
            date=ts,
            author=author,
            content=src.snippet[:150] if src.snippet else src.title,
            x=x,
            y=y,
            rotation=rot,
            url=src.url if src.url else None,
        ))

        # ── Connection: source → claim with per-source NLI ───────────────
        src_score = source_score_map.get(src.url, verification_score)
        nli_type = "entailment" if src_score >= 0.65 else "contradiction"
        # round(), not int(): 0.57 * 100 is 56.99999999999999 and would
        # otherwise be shown as 56.
        nli_score = max(10, min(99, round(src_score * 100)))

        conns.append(FrontendConnection(
            from_field=node_id,
            to="claim_0",
            nli={"type": nli_type, "score": nli_score},
        ))

    return nodes, conns
454
+
455
+
456
+ def _extract_ground_truth_string(sources: list) -> str:
457
+ """Extract the established fact string from the highest-trust source."""
458
+ if not sources:
459
+ return "No established fact could be determined from available sources."
460
+
461
+ # Prefer Wikipedia first
462
+ for s in sources:
463
+ if "wikipedia.org" in s.url:
464
+ return s.snippet[:300] if s.snippet else s.title
465
+
466
+ # Then any high-trust source
467
+ for s in sources:
468
+ if s.trust == "high" and s.snippet:
469
+ return s.snippet[:300]
470
+
471
+ # Fallback to first source with a snippet
472
+ for s in sources:
473
+ if s.snippet:
474
+ return s.snippet[:300]
475
+
476
+ return "No established fact could be determined from available sources."
477
+
478
+
479
def _build_segments(
    text: str,
    suspicious: dict,
    ground_truth: GroundTruthData,
    ml_label: str = "",
    ml_confidence: float = 0.0,
) -> list[FrontendSegment]:
    """Split the claim text into annotated segments.

    When *ml_label* is given, a "[LINGUISTIC ANALYSIS] " segment carrying
    the model's reasoning comes first. The text is then split into
    sentences; a sentence is flagged when it contains a suspicious phrase,
    or — for texts of at most five sentences — while unused evidence items
    remain. Evidence items are consumed in order as annotations.

    Args:
        text: The claim text.
        suspicious: Mapping with optional "clickbait_phrases",
            "emotional_language" and "unsupported_claims" lists.
        ground_truth: Supplies ``evidence_items`` (badge + text).
        ml_label: Classifier label; empty to skip the analysis segment.
        ml_confidence: Classifier confidence in 0..1.
    """
    segments: list[FrontendSegment] = []

    # ── Segment 0: ML Model Linguistic Analysis ──────────────────────────
    if ml_label:
        ml_label_display = ml_label.upper()
        # round(), not int(): 0.57 * 100 is 56.99999999999999, and
        # truncating made this note disagree with the rounded percentage in
        # the decision explanation.
        ml_pct = round(ml_confidence * 100)
        if ml_label_display == "FAKE":
            ml_note = (
                f"The local NLP model analyzed the linguistic syntax and scored "
                f"this claim at {ml_pct}% FAKE due to sensationalist phrasing, "
                f"emotional manipulation, or patterns consistent with disinformation."
            )
        elif ml_label_display == "REAL":
            ml_note = (
                f"The local NLP model analyzed the linguistic syntax and scored "
                f"this claim at {ml_pct}% REAL — professional journalistic tone "
                f"detected with minimal sensationalist markers."
            )
        else:
            ml_note = (
                f"The local NLP model analyzed the linguistic syntax but could "
                f"not reach a definitive conclusion (confidence: {ml_pct}%). "
                f"The text contains a mix of professional and informal language patterns."
            )
        segments.append(FrontendSegment(
            text="[LINGUISTIC ANALYSIS] ",
            isSuspicious=True,
            annotation=FrontendAnnotation(type="unverified", note=ml_note),
        ))

    # ── Evidence items double as annotations, consumed in order ──────────
    evidence_annotations = [(item.badge, item.text) for item in ground_truth.evidence_items]

    # Lower-case once; skip empty phrases, which would match every sentence.
    sus_phrases: list[str] = []
    for key in ("clickbait_phrases", "emotional_language", "unsupported_claims"):
        sus_phrases.extend(p.lower() for p in suspicious.get(key, []) if p)

    # re.split always returns at least one element, so no empty-list case.
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    short_text = len(sentences) <= 5

    badge_to_annotation_type = {
        "FALLACY": "fallacy",
        "UNVERIFIED": "unverified",
        "CONTRADICTION": "contradiction",
        "CORROBORATED": "verified",
    }

    evidence_idx = 0
    for sentence in sentences:
        stripped = sentence.strip()
        if not stripped:
            continue
        # Trailing space so segments concatenate back into readable text.
        sentence_text = stripped + " "
        lowered = sentence_text.lower()

        has_evidence = evidence_idx < len(evidence_annotations)
        is_sus = any(phrase in lowered for phrase in sus_phrases)
        if not is_sus and has_evidence and short_text:
            is_sus = True

        annotation = None
        if is_sus and has_evidence:
            badge, note = evidence_annotations[evidence_idx]
            ann_type = badge_to_annotation_type.get(badge, "unverified")
            annotation = FrontendAnnotation(type=ann_type, note=note)
            evidence_idx += 1

        # A segment is only flagged when it actually carries an annotation.
        segments.append(FrontendSegment(
            text=sentence_text,
            isSuspicious=annotation is not None,
            annotation=annotation,
        ))

    return segments
571
+
572
+
573
+ def _build_confidence_explanation(
574
+ ml_label: str,
575
+ ml_confidence: float,
576
+ similarity_score: float,
577
+ num_sources: int,
578
+ high_trust_count: int,
579
+ low_trust_count: int,
580
+ final_prediction: str,
581
+ final_confidence: int,
582
+ wiki_verified: bool,
583
+ ) -> str:
584
+ """Build a highly detailed, analytical explanation of how the confidence score was derived."""
585
+ parts: list[str] = []
586
+
587
+ # ── 1. ML Model analysis ─────────────────────────────────────────────
588
+ ml_pct = int(ml_confidence * 100)
589
+ parts.append(
590
+ f"STEP 1 — LINGUISTIC ANALYSIS: The local DistilBERT NLP model "
591
+ f"classified the text as {ml_label.upper()} with {ml_pct}% internal "
592
+ f"confidence after analyzing syntax patterns, sensationalist markers, "
593
+ f"and journalistic tone indicators."
594
+ )
595
+
596
+ # ── 2. Cross-Encoder verification ────────────────────────────────────
597
+ sim_pct = int(similarity_score * 100)
598
+ threshold_met = "PASSED" if similarity_score >= 0.65 else "FAILED"
599
+ parts.append(
600
+ f"STEP 2 — CROSS-ENCODER VERIFICATION: A live internet scan retrieved "
601
+ f"{num_sources} source(s). The Cross-Encoder semantic similarity scored "
602
+ f"{sim_pct}% against the 65% entailment threshold ({threshold_met}). "
603
+ f"{'Wikipedia independently corroborated the claim.' if wiki_verified else 'No Wikipedia corroboration was found.'}"
604
+ )
605
+
606
+ # ── 3. Source trust breakdown ─────────────────────────────────────────
607
+ medium_trust = num_sources - high_trust_count - low_trust_count
608
+ parts.append(
609
+ f"STEP 3 — SOURCE TRUST AUDIT: Of {num_sources} sources, "
610
+ f"{high_trust_count} rated HIGH trust, {medium_trust} rated MEDIUM, "
611
+ f"and {low_trust_count} rated LOW. "
612
+ f"{'A strong evidence base supports this verdict.' if high_trust_count >= 2 else 'The evidence base is limited, which affects overall confidence.'}"
613
+ )
614
+
615
+ # ── 4. Guardrail activations ─────────────────────────────────────────
616
+ guardrails: list[str] = []
617
+ if num_sources == 0:
618
+ guardrails.append("ZERO-EVIDENCE PENALTY (no sources found, verdict forced to FABRICATED)")
619
+ if final_prediction == "Uncertain" and similarity_score < 0.78 and not wiki_verified:
620
+ guardrails.append("MUDDY WATERS GUARDRAIL (weak corroboration, verdict shifted to INCONCLUSIVE)")
621
+
622
+ if guardrails:
623
+ parts.append(f"STEP 4 — GUARDRAILS TRIGGERED: {'; '.join(guardrails)}.")
624
+ else:
625
+ parts.append("STEP 4 — GUARDRAILS: No safety overrides were triggered. The verdict reflects the raw analysis.")
626
+
627
+ # ── 5. Final synthesis ───────────────────────────────────────────────
628
+ parts.append(
629
+ f"FINAL SYNTHESIS: Combining the ML model's {ml_label.upper()} signal, "
630
+ f"the {sim_pct}% semantic match, and {num_sources} source(s), the system "
631
+ f"arrived at a final confidence of {final_confidence}%."
632
+ )
633
+
634
+ return " ▸ ".join(parts)
635
+
636
+
637
+ # ── Endpoints ───────────────────────────────────────────────────────────────
638
@app.get("/health")
async def health():
    """Health endpoint returning a static status payload."""
    status = {"status": "healthy", "service": "VeriLens AI"}
    return status
641
+
642
@app.post("/analyze", response_model=AnalyzeResponse)
async def analyze(req: AnalyzeRequest):
    """Analyze a claim or article URL and return the full verdict payload.

    The input is resolved to text (fetching the article when it is a URL),
    run through NLP helpers, the classifier and online verification, and
    combined by ``make_decision``. Two guardrails may then override the
    verdict before the dashboard and frontend structures are built.

    Raises:
        HTTPException: 400 for empty input; 422 when ``extract_article``
            raises ``ValueError``.
    """
    raw = req.input.strip()
    if not raw:
        raise HTTPException(status_code=400, detail="Input cannot be empty.")

    # Monotonic clock: the elapsed time cannot jump if the wall clock changes.
    t0 = time.perf_counter()

    if _is_url(raw):
        input_type = "URL"
        try:
            article = extract_article(raw)
        except ValueError as exc:
            # Chain the cause so the original error stays in the traceback.
            raise HTTPException(status_code=422, detail=str(exc)) from exc
        text = f"{article.title}. {article.text}"
    else:
        input_type = "TEXT"
        text = raw

    language = detect_language(text)
    keywords = extract_keywords(text, top_n=8)
    suspicious = detect_suspicious_phrases(text)
    search_query = build_search_query(text)

    ml_result = classify(text)
    verification = await verify_claim(text, search_query)

    high_trust = sum(1 for s in verification.sources if s.trust == "high")
    low_trust = sum(1 for s in verification.sources if s.trust == "low")

    # ── Decision ────────────────────────────────────────────────────────────
    decision = make_decision(
        ml_label=ml_result.label,
        ml_confidence=ml_result.confidence,
        similarity_score=verification.similarity_score,
        sources_verified=verification.verified,
        suspicious_info=suspicious,
        high_trust_count=high_trust,
        low_trust_count=low_trust,
    )

    # .title() turns the engine verdict into "Real", "Fake" or "Uncertain",
    # the keys used by the verdict maps and the helper functions.
    final_prediction = str(decision.prediction).title()
    final_confidence = int(decision.confidence)
    final_explanation = str(decision.explanation)
    wiki_verified = any("wikipedia.org" in s.url for s in verification.sources)

    # ── Guardrail 1: zero-evidence penalty ──────────────────────────────────
    # A Real/Uncertain verdict with no sources at all is forced to Fake.
    if final_prediction in ["Real", "Uncertain"] and len(verification.sources) == 0:
        logger.warning("Zero-Evidence Penalty triggered! Overriding AI verdict.")
        final_prediction = "Fake"
        final_confidence = 10  # This forces the UI bar to "Unreliable" (RED)
        final_explanation = "The AI text analysis found no sensationalism, but a live internet scan found ZERO evidence to support this claim. In journalism, a total lack of corroboration for a statement indicates it is unverified or FAKE."

    # ── Guardrail 2: "muddy waters" ─────────────────────────────────────────
    # A Real verdict with a weak/moderate match (< 0.78) and no Wikipedia
    # source is downgraded to Uncertain.
    elif final_prediction == "Real" and verification.similarity_score < 0.78 and not wiki_verified:
        logger.warning("Muddy Waters Guardrail triggered! Weak internet corroboration.")
        final_prediction = "Uncertain"
        final_confidence = 50  # Pushes UI perfectly to the center YELLOW
        final_explanation = "The AI detected a professional journalistic tone, and related topics were found online. However, the EXACT claim could not be highly corroborated by the Cross-Encoder. This may be a misleading mix of real entities and fake events."

    # ── Build supplementary data for Figma dashboard ────────────────────────
    source_outs = [
        SourceOut(title=s.title, url=s.url, snippet=s.snippet, trust=s.trust)
        for s in verification.sources
    ]

    verdict_label = _VERDICT_MAP.get(final_prediction, "UNDER REVIEW")
    case_number = _generate_case_number(text)
    origin_map = _build_origin_map(verification.sources, verification.similarity_score, text)
    ground_truth = _build_ground_truth(
        final_prediction, final_explanation, suspicious, keywords, verification.sources
    )

    # ── Build frontend-compatible structures ────────────────────────────────
    frontend_verdict = _FRONTEND_VERDICT_MAP.get(final_prediction, "INCONCLUSIVE")
    frontend_source_tree, frontend_connections = _build_direct_source_tree(
        text, verification.sources, verification.similarity_score,
    )
    frontend_segments = _build_segments(
        text, suspicious, ground_truth,
        ml_label=ml_result.label, ml_confidence=ml_result.confidence,
    )
    ground_truth_string = _extract_ground_truth_string(verification.sources)

    # ── Build the detailed confidence explanation ───────────────────────────
    confidence_explanation = _build_confidence_explanation(
        ml_label=ml_result.label,
        ml_confidence=ml_result.confidence,
        similarity_score=verification.similarity_score,
        num_sources=len(verification.sources),
        high_trust_count=high_trust,
        low_trust_count=low_trust,
        final_prediction=final_prediction,
        final_confidence=final_confidence,
        wiki_verified=wiki_verified,
    )

    elapsed = int((time.perf_counter() - t0) * 1000)

    return AnalyzeResponse(
        input_type=input_type,
        prediction=final_prediction,
        confidence=final_confidence,
        explanation=final_explanation,
        sources=source_outs,
        language=language,
        keywords=keywords,
        suspicious=suspicious,
        factors=decision.factors,
        elapsed_ms=elapsed,
        verdict_label=verdict_label,
        case_number=case_number,
        origin_map=origin_map,
        ground_truth=ground_truth,
        # ── Frontend fields ──────────────────────────────────────────────
        claim=text,
        verdict=frontend_verdict,
        segments=frontend_segments,
        sourceTree=frontend_source_tree,
        connections=frontend_connections,
        groundTruth=ground_truth_string,
        confidenceExplanation=confidence_explanation,
    )
model.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ML Classifier for VeriLens AI
3
+ Primary: HuggingFace text-classification pipeline (DistilBERT).
4
+ Fallback: Heuristic keyword-based scoring when the model is unavailable.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ from dataclasses import dataclass
11
+ from pathlib import Path
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ # ── Lazy-loaded globals ─────────────────────────────────────────────────────
16
+ _pipeline = None
17
+ _model_ready = False
18
+
19
+
20
@dataclass
class ClassificationResult:
    """Outcome of one classification call."""

    # "FAKE", "REAL" or "UNCERTAIN" (the heuristic below returns "UNCERTAIN"
    # when it finds no signal or the signals are balanced).
    label: str
    # Score attached to ``label``, 0.0 – 1.0.
    confidence: float


# ── Heuristic fallback ─────────────────────────────────────────────────────
_FAKE_SIGNALS = [
    "you won't believe", "shocking", "exposed", "secret",
    "they don't want you to know", "mind-blowing", "conspiracy",
    "cover-up", "banned", "censored", "wake up", "big pharma",
    "doctors hate", "one weird trick", "must watch",
    "share before it's too late", "mainstream media won't tell you",
    "spread this before it's deleted", "bombshell", "unbelievable",
]

_REAL_SIGNALS = [
    "according to", "officials said", "the report states",
    "data shows", "peer-reviewed", "study published",
    "reuters", "associated press", "confirmed by",
    "government statement", "press release", "research findings",
    "published in the journal", "the investigation found",
]


def _contains_phrase(haystack: str, phrase: str) -> bool:
    """Return True when *phrase* occurs in *haystack* as a whole word/phrase.

    A hit only counts when the characters directly before and after it are not
    alphanumeric, so "secret" no longer matches inside "secretary".
    """
    start = haystack.find(phrase)
    while start != -1:
        end = start + len(phrase)
        left_ok = start == 0 or not haystack[start - 1].isalnum()
        right_ok = end == len(haystack) or not haystack[end].isalnum()
        if left_ok and right_ok:
            return True
        start = haystack.find(phrase, start + 1)
    return False


def _heuristic_classify(text: str) -> ClassificationResult:
    """Simple keyword-based scoring used when the transformer is unavailable.

    Counts how many entries of ``_FAKE_SIGNALS`` and ``_REAL_SIGNALS`` occur in
    the lower-cased *text* (whole-phrase matches only) and derives the label
    from the share of fake signals:

    * no signal at all        -> "UNCERTAIN", confidence 0.50
    * fake share above 0.6    -> "FAKE"
    * fake share below 0.4    -> "REAL"
    * anything in between     -> "UNCERTAIN", confidence 0.55

    For FAKE/REAL the confidence is 0.5 plus 0.4 times the winning share,
    rounded to two decimals.
    """
    lower = text.lower()
    fake_hits = sum(1 for phrase in _FAKE_SIGNALS if _contains_phrase(lower, phrase))
    real_hits = sum(1 for phrase in _REAL_SIGNALS if _contains_phrase(lower, phrase))

    total = fake_hits + real_hits
    if total == 0:
        return ClassificationResult(label="UNCERTAIN", confidence=0.50)

    fake_ratio = fake_hits / total
    if fake_ratio > 0.6:
        return ClassificationResult(label="FAKE", confidence=round(0.5 + fake_ratio * 0.4, 2))
    if fake_ratio < 0.4:
        return ClassificationResult(label="REAL", confidence=round(0.5 + (1 - fake_ratio) * 0.4, 2))
    return ClassificationResult(label="UNCERTAIN", confidence=0.55)
61
+
62
+
63
+ # ── Model loading ──────────────────────────────────────────────────────────
64
+ _LOCAL_MODEL_DIR = Path(__file__).resolve().parent / "trained_model_v2"
65
+
66
+
67
def load_model() -> None:
    """
    Load the text-classification pipeline into the module-level ``_pipeline``.

    Prefers the locally fine-tuned model in ``_LOCAL_MODEL_DIR`` when that
    directory contains a ``config.json``; otherwise falls back to the
    HuggingFace remote model ``hamzab/roberta-fake-news-classification``.
    Call once at startup; after a successful load, subsequent calls are no-ops.

    Any failure (missing dependency, download error, ...) is logged as a
    warning and leaves ``_model_ready`` False, so ``classify`` keeps using the
    heuristic fallback.
    """
    global _pipeline, _model_ready
    if _model_ready:
        return
    try:
        from transformers import pipeline as hf_pipeline
        import torch

        # ⚡ Universal Hardware Detection (Windows / Mac / Linux)
        # ``torch.backends.mps`` is looked up defensively: if the attribute is
        # missing we want to fall through to CPU, not abort the whole load.
        mps_backend = getattr(torch.backends, "mps", None)
        if torch.cuda.is_available():
            active_device = torch.device("cuda")
            gpu_name = torch.cuda.get_device_name(0)
            logger.info("Hardware detection: NVIDIA GPU (%s) found. Routing to CUDA.", gpu_name)
        elif mps_backend is not None and mps_backend.is_available():
            active_device = torch.device("mps")
            logger.info("Hardware detection: Apple Silicon found. Routing to MPS.")
        else:
            active_device = torch.device("cpu")
            logger.info("Hardware detection: No GPU found. Defaulting to CPU.")

        if _LOCAL_MODEL_DIR.exists() and (_LOCAL_MODEL_DIR / "config.json").exists():
            model_path = str(_LOCAL_MODEL_DIR)
            logger.info("Loading locally trained model from %s …", model_path)
        else:
            model_path = "hamzab/roberta-fake-news-classification"
            logger.info("Loading HuggingFace remote model: %s …", model_path)

        # ⚡ Pass the dynamically selected device to the pipeline
        _pipeline = hf_pipeline(
            "text-classification",
            model=model_path,
            truncation=True,
            max_length=512,
            device=active_device,
        )
        _model_ready = True
        logger.info("Model loaded successfully.")
    except Exception as exc:
        # Deliberate best-effort: the service must still start without a model.
        logger.warning("Could not load model (%s). Using heuristic fallback.", exc)
        _model_ready = False
113
+
114
+
115
def classify(text: str) -> ClassificationResult:
    """
    Classify *text* as REAL or FAKE.

    Falls back to heuristic scoring if the transformer model is unavailable
    (not loaded yet, failed to load, or inference raises).

    Returns a ``ClassificationResult`` whose label is "FAKE", "REAL", or
    "UNCERTAIN" when the model emits a label this function does not recognise.
    The confidence is the model score rounded to four decimals.
    """
    if not _model_ready or _pipeline is None:
        return _heuristic_classify(text)

    try:
        # Truncate very long texts for speed
        truncated = text[:2048]
        result = _pipeline(truncated)[0]
        raw_label: str = result["label"].upper()
        score: float = result["score"]

        # Normalise labels coming from the model.
        # NOTE(review): "TRUE"/"FALSE" are accepted as aliases because the
        # remote fallback checkpoint appears to label real news "TRUE" rather
        # than "REAL" — confirm against its model card / id2label mapping.
        if "FAKE" in raw_label or raw_label in ("LABEL_0", "FALSE"):
            label = "FAKE"
        elif "REAL" in raw_label or raw_label in ("LABEL_1", "TRUE"):
            label = "REAL"
        else:
            label = "UNCERTAIN"

        return ClassificationResult(label=label, confidence=round(score, 4))
    except Exception as exc:
        logger.error("Model inference failed: %s – falling back to heuristic.", exc)
        return _heuristic_classify(text)
nlp_utils.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ NLP Utilities for VeriLens AI
3
+ - Text preprocessing (lowercasing, stopword removal, tokenization)
4
+ - Keyword extraction for search queries
5
+ - Suspicious phrase detection
6
+ - Language detection (English / Hindi)
7
+ """
8
+
9
+ import re
10
+ import string
11
+
12
+ # ── stopwords (lightweight, no NLTK download needed) ────────────────────────
13
+ ENGLISH_STOPWORDS = {
14
+ "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
15
+ "have", "has", "had", "do", "does", "did", "will", "would", "could",
16
+ "should", "may", "might", "shall", "can", "need", "dare", "ought",
17
+ "used", "to", "of", "in", "for", "on", "with", "at", "by", "from",
18
+ "as", "into", "through", "during", "before", "after", "above", "below",
19
+ "between", "out", "off", "over", "under", "again", "further", "then",
20
+ "once", "here", "there", "when", "where", "why", "how", "all", "both",
21
+ "each", "few", "more", "most", "other", "some", "such", "no", "nor",
22
+ "not", "only", "own", "same", "so", "than", "too", "very", "just",
23
+ "because", "but", "and", "or", "if", "while", "about", "up", "its",
24
+ "it", "he", "she", "they", "we", "you", "i", "me", "him", "her",
25
+ "us", "them", "my", "your", "his", "our", "their", "this", "that",
26
+ "these", "those", "what", "which", "who", "whom", "s", "t", "don",
27
+ "didn", "doesn", "hadn", "hasn", "haven", "isn", "wasn", "weren",
28
+ "won", "wouldn", "couldn", "shouldn", "ain", "aren", "re", "ve", "ll",
29
+ }
30
+
31
+ # ── suspicious / clickbait phrases ──────────────────────────────────────────
32
+ CLICKBAIT_PHRASES = [
33
+ "you won't believe",
34
+ "shocking",
35
+ "breaking",
36
+ "exposed",
37
+ "secret",
38
+ "they don't want you to know",
39
+ "what they're hiding",
40
+ "mind-blowing",
41
+ "jaw-dropping",
42
+ "unbelievable",
43
+ "gone wrong",
44
+ "doctors hate",
45
+ "one weird trick",
46
+ "this will change everything",
47
+ "spread this before it's deleted",
48
+ "mainstream media won't tell you",
49
+ "exposed the truth",
50
+ "wake up",
51
+ "big pharma",
52
+ "conspiracy",
53
+ "cover-up",
54
+ "coverup",
55
+ "bombshell",
56
+ "urgent",
57
+ "must watch",
58
+ "must read",
59
+ "share before it's too late",
60
+ "banned",
61
+ "censored",
62
+ ]
63
+
64
+ EMOTIONAL_PHRASES = [
65
+ "absolutely",
66
+ "totally",
67
+ "completely",
68
+ "utterly",
69
+ "extremely",
70
+ "terrifying",
71
+ "horrifying",
72
+ "devastating",
73
+ "outrageous",
74
+ "disgusting",
75
+ "insane",
76
+ "crazy",
77
+ "incredible",
78
+ "miraculous",
79
+ "phenomenal",
80
+ "unprecedented",
81
+ "never before seen",
82
+ "the truth about",
83
+ "exposed",
84
+ "the real story",
85
+ ]
86
+
87
+ UNSUPPORTED_CLAIM_MARKERS = [
88
+ "sources say",
89
+ "experts believe",
90
+ "studies show",
91
+ "according to sources",
92
+ "rumor has it",
93
+ "allegedly",
94
+ "it is believed",
95
+ "some people say",
96
+ "many believe",
97
+ "reports suggest",
98
+ "anonymous sources",
99
+ "unnamed officials",
100
+ "insiders reveal",
101
+ ]
102
+
103
+ # ── Hindi character range for language detection ────────────────────────────
104
+ HINDI_PATTERN = re.compile(r"[\u0900-\u097F]")
105
+
106
+
107
# Translation table for preprocess_text: ASCII punctuation is deleted, except
# apostrophes (straight and typographic), which become spaces so contractions
# split into the fragments listed in ENGLISH_STOPWORDS ("don", "t", "ll", ...).
_PUNCT_TABLE = {ord(ch): None for ch in string.punctuation}
_PUNCT_TABLE[ord("'")] = " "
_PUNCT_TABLE[ord("\u2019")] = " "


def preprocess_text(text: str) -> str:
    """Lowercase, remove punctuation, remove stopwords.

    Returns the surviving tokens joined by single spaces. Apostrophes split
    tokens instead of being deleted: "don't" becomes "don" + "t", both of
    which are stopwords, instead of the unfiltered token "dont".
    """
    tokens = text.lower().translate(_PUNCT_TABLE).split()
    return " ".join(t for t in tokens if t not in ENGLISH_STOPWORDS)
114
+
115
+
116
def extract_keywords(text: str, top_n: int = 10) -> list[str]:
    """Return up to *top_n* of the most frequent non-stopword tokens.

    Tokens come from ``preprocess_text``; tokens of one or two characters are
    ignored. Ties keep first-occurrence order because the sort is stable.
    """
    counts: dict[str, int] = {}
    for token in preprocess_text(text).split():
        if len(token) <= 2:
            continue
        counts[token] = counts.get(token, 0) + 1

    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return [token for token, _ in ranked][:top_n]
126
+
127
+
128
+
129
+ import re
130
+
131
+ import re
132
+
133
# 1. Conversational filler, clickbait, and Gen Z slang phrases that are removed
#    from a claim before it is turned into a search query.
_SEARCH_FILLERS = (
    # News/WhatsApp filler
    "is it true that", "i heard that", "someone told me", "can you check if",
    "they are saying", "breaking news", "shocking", "whatsapp forward",
    "forwarded as received", "please verify", "pls verify", "can you verify",
    "fact check this", "tell me if", "did you hear", "rumor has it",
    "watch till the end", "viral video", "secret exposed", "must watch",
    "mind blowing", "i read somewhere", "is this real", "is this fake",
    "check this news", "verify this claim", "you won't believe",
    "alert:", "warning:", "urgent:", "fwd:", "bro is it true", "bhau tell me",

    # Gen Z / Internet Slang Phrases
    "no cap", "fr fr", "on god", "spill the tea", "is it giving",
    "big yikes", "to be honest", "not gonna lie", "out of pocket",
    "let him cook", "make it make sense", "rent free", "touch grass",
    "caught in 4k", "main character energy", "pop off", "periodt",
    "for real", "deadass", "lowkey", "highkey", "tbh", "ngl", "chat is this real",
    "make it viral",
)


def _filler_regex(phrase: str) -> "re.Pattern[str]":
    """Compile *phrase* so it only matches on word boundaries."""
    # A boundary is only demanded on a side that ends in a word character, so
    # "alert:" still matches when text follows the colon directly.
    left = r"(?<!\w)" if phrase[0].isalnum() else ""
    right = r"(?!\w)" if phrase[-1].isalnum() else ""
    return re.compile(left + re.escape(phrase) + right)


# Compiled once; applied in list order, exactly like the phrase list above.
_SEARCH_FILLER_PATTERNS = tuple(_filler_regex(p) for p in _SEARCH_FILLERS)

# 3. Comprehensive English Stop Words + Gen Z "Brainrot" Dictionary
_SEARCH_STOP_WORDS = frozenset({
    # Standard English NLP Stop Words
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your",
    "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she",
    "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
    "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
    "these", "those", "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an",
    "the", "and", "but", "if", "or", "because", "as", "until", "while", "of",
    "at", "by", "for", "with", "about", "against", "between", "into", "through",
    "during", "before", "after", "above", "below", "to", "from", "up", "down",
    "in", "out", "on", "off", "over", "under", "again", "further", "then",
    "once", "here", "there", "when", "where", "why", "how", "all", "any",
    "both", "each", "few", "more", "most", "other", "some", "such", "no",
    "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s",
    "t", "can", "will", "just", "don", "should", "now", "d", "ll", "m", "o",
    "re", "ve", "y", "ain", "aren", "couldn", "didn", "doesn", "hadn", "hasn",
    "haven", "isn", "ma", "mightn", "mustn", "needn", "shan", "shouldn",
    "wasn", "weren", "won", "wouldn", "tell", "know", "think", "believe",
    "say", "said", "saying", "ask", "asked", "check", "news", "today", "new",

    # Gen Z / Internet Slang Single Words
    "fr", "cap", "bruh", "bro", "dude", "rn", "skibidi", "rizz", "sigma",
    "bet", "af", "smh", "idk", "idc", "lmao", "lmfao", "lol", "rofl", "omg",
    "sus", "legit", "bussin", "yall", "based", "cringe", "ratio", "gyatt",
    "mewing", "lit", "fire", "tea", "dub", "flop", "iykyk", "literally",
    "actually", "basically", "seriously", "like", "yap", "yapping",
    "delulu", "solulu", "pookie", "aura", "chat", "fyi", "lmk", "tldr",
})


def build_search_query(text: str) -> str:
    """
    Strips conversational filler, internet slang, and extracts the core claim
    for a laser-focused web search.

    Steps:
      1. Lower-case *text* and blank out every filler phrase. Fillers only
         match as whole phrases, so "ngl" no longer mutilates "england",
         "bangladesh" or "single".
      2. Split the remainder into ``\\w+`` word tokens.
      3. Drop stop words and single-word slang.
      4. Keep the first 8 remaining words.

    Returns the space-joined keywords, or the first 50 characters of the
    original *text* when nothing survives the filtering.
    """
    clean_text = text.lower()
    for pattern in _SEARCH_FILLER_PATTERNS:
        clean_text = pattern.sub(" ", clean_text)

    # 2. Keep only word tokens.
    # NOTE(review): ``\w`` does not cover combining vowel signs, so Devanagari
    # and similar scripts are probably split mid-word here — confirm with a
    # Hindi sample before relying on this for non-Latin queries.
    words = re.findall(r'\b\w+\b', clean_text)

    # Filter out the stop words and slang
    core_keywords = [word for word in words if word not in _SEARCH_STOP_WORDS]

    # 4. Limit to top 8 keywords so Google News doesn't get overwhelmed
    final_query = " ".join(core_keywords[:8])

    # Fallback just in case they typed nothing but slang/stop words
    return final_query if final_query.strip() else text[:50]
204
+
205
+
206
def detect_language(text: str) -> str:
    """Return "hi" when *text* is predominantly Devanagari, otherwise "en".

    The number of characters matched by ``HINDI_PATTERN`` is compared with the
    number of alphabetic characters; a ratio above 0.3 yields "hi". Text
    without any alphabetic character is reported as "en".
    """
    alpha_count = len([ch for ch in text if ch.isalpha()])
    if not alpha_count:
        return "en"

    hindi_count = len(HINDI_PATTERN.findall(text))
    return "hi" if hindi_count / alpha_count > 0.3 else "en"
215
+
216
+
217
def _has_whole_phrase(haystack: str, phrase: str) -> bool:
    """Return True when *phrase* occurs in *haystack* as a whole word/phrase.

    The characters directly before and after a hit must not be alphanumeric,
    so "urgent" is not reported inside "insurgent" nor "breaking" inside
    "groundbreaking".
    """
    start = haystack.find(phrase)
    while start != -1:
        end = start + len(phrase)
        left_ok = start == 0 or not haystack[start - 1].isalnum()
        right_ok = end == len(haystack) or not haystack[end].isalnum()
        if left_ok and right_ok:
            return True
        start = haystack.find(phrase, start + 1)
    return False


def detect_suspicious_phrases(text: str) -> dict:
    """Scan text for clickbait, emotional, and unsupported-claim markers.

    Matching is case-insensitive and whole-phrase only. Returns a dict with
    the matched phrases per category (in the order of the phrase lists) and
    ``total_suspicious_count``, the sum of the three list lengths.
    """
    lower = text.lower()
    found_clickbait = [p for p in CLICKBAIT_PHRASES if _has_whole_phrase(lower, p)]
    found_emotional = [p for p in EMOTIONAL_PHRASES if _has_whole_phrase(lower, p)]
    found_unsupported = [p for p in UNSUPPORTED_CLAIM_MARKERS if _has_whole_phrase(lower, p)]
    total = len(found_clickbait) + len(found_emotional) + len(found_unsupported)
    return {
        "clickbait_phrases": found_clickbait,
        "emotional_language": found_emotional,
        "unsupported_claims": found_unsupported,
        "total_suspicious_count": total,
    }
230
+
231
+
232
def tokenize(text: str) -> list[str]:
    """Lower-case *text*, turn every non-word, non-space character into a
    space, and split the result on whitespace."""
    spaced = re.sub(r"[^\w\s]", " ", text.lower())
    return spaced.split()
requirements.txt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ── VeriLens AI V2 Requirements ──────────────────────────────────────────────
2
+
3
+ # Web Server & API
4
+ fastapi==0.115.0
5
+ uvicorn[standard]==0.30.6
6
+ pydantic>=2.0.0
7
+ python-dotenv==1.0.1
8
+ httpx==0.27.2
9
+
10
+ # Modern Web Scraping (Replaces newspaper3k)
11
+ trafilatura>=1.12.0
12
+ lxml-html-clean==0.4.1
13
+
14
+ # Machine Learning & Transformers
15
+ torch==2.4.1
16
+ transformers==4.44.2
17
+ sentence-transformers==3.0.1
18
+ scikit-learn==1.5.1
19
+ numpy>=1.24.0
20
+ pandas>=2.0.0
21
+
22
+ # OS & Internet Tools
23
+ duckduckgo-search>=7.0.0
24
+ certifi
scraper.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Web Scraper for VeriLens AI (V2 - Trafilatura Engine)
3
+ Uses the modern trafilatura library to bypass bot-blockers,
4
+ strip out cookie banners, and extract pristine article text for NLP.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ from dataclasses import dataclass
11
+
12
+ import trafilatura
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
@dataclass
class ScrapedArticle:
    """Plain record describing one scraped article."""

    # Headline of the article.
    title: str
    # Extracted body text.
    text: str
    # Author names, one entry per author (may be empty).
    authors: list[str]
    # Publication date as a string, or None when unavailable.
    publish_date: str | None
    # URL the article was fetched from.
    source_url: str
24
+
25
+
26
def extract_article(url: str) -> ScrapedArticle:
    """
    Download and parse a news article from *url* using Trafilatura.

    Returns a ``ScrapedArticle`` with title, text, authors, publish date and
    the source URL.

    Raises ValueError when the page cannot be fetched, or when fewer than 50
    characters of text could be extracted (video pages, paywalls, JS-only
    sites, ...).
    """
    logger.info("Attempting to scrape URL: %s", url)

    # 1. Fetch the raw HTML (Trafilatura handles redirects and headers automatically)
    downloaded = trafilatura.fetch_url(url)

    if downloaded is None:
        logger.error("Fetch failed for %s. The site may be down or actively blocking bots.", url)
        raise ValueError("Could not access URL. The site may be blocking automated requests or is invalid.")

    # 2. Extract the text and metadata.
    # We disable comments and tables to keep the text as pure as possible for the AI.
    extracted = trafilatura.bare_extraction(
        downloaded,
        include_comments=False,
        include_tables=False,
    )

    def _field(name: str):
        """Read *name* from the extraction result, whatever its container type."""
        # Older trafilatura releases return a dict here; newer ones return a
        # Document object with attributes. Supporting both keeps the scraper
        # working across the unpinned ``trafilatura>=1.12.0`` requirement.
        if extracted is None:
            return None
        if isinstance(extracted, dict):
            return extracted.get(name)
        return getattr(extracted, name, None)

    # 3. Guardrail: Did we actually get text?
    text = _field('text') or ''
    if len(text.strip()) < 50:
        logger.warning("Extraction failed or returned too little text for %s", url)
        raise ValueError(
            "Extracted article content is too short or empty. "
            "The URL may be a video, a paywalled article, or heavily obfuscated with JavaScript."
        )

    # 4. Clean up the metadata
    title = _field('title') or "Unknown Title"
    date = _field('date')

    # Trafilatura usually returns authors as a single string separated by semicolons or commas
    raw_author = _field('author')
    if raw_author:
        # Split by comma or semicolon and clean up whitespace
        authors = [a.strip() for a in raw_author.replace(';', ',').split(',') if a.strip()]
    else:
        authors = []

    logger.info("Successfully scraped: '%s...' (%d characters)", title[:30], len(text))

    return ScrapedArticle(
        title=title,
        text=text,
        authors=authors,
        publish_date=date,
        source_url=url,
    )
trained_model_v2/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "distilbert-base-uncased",
3
+ "activation": "gelu",
4
+ "architectures": [
5
+ "DistilBertForSequenceClassification"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "hidden_dim": 3072,
11
+ "id2label": {
12
+ "0": "FAKE",
13
+ "1": "REAL"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "label2id": {
17
+ "FAKE": 0,
18
+ "REAL": 1
19
+ },
20
+ "max_position_embeddings": 512,
21
+ "model_type": "distilbert",
22
+ "n_heads": 12,
23
+ "n_layers": 6,
24
+ "pad_token_id": 0,
25
+ "problem_type": "single_label_classification",
26
+ "qa_dropout": 0.1,
27
+ "seq_classif_dropout": 0.2,
28
+ "sinusoidal_pos_embds": false,
29
+ "tie_weights_": true,
30
+ "torch_dtype": "float32",
31
+ "transformers_version": "4.44.2",
32
+ "vocab_size": 30522
33
+ }
trained_model_v2/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88d5997db34a6989bc93d791e3f16f0e8a330b449f3cab3bc064057bd9e1e2d3
3
+ size 267832560
trained_model_v2/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
trained_model_v2/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
trained_model_v2/tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "mask_token": "[MASK]",
48
+ "model_max_length": 512,
49
+ "pad_token": "[PAD]",
50
+ "sep_token": "[SEP]",
51
+ "strip_accents": null,
52
+ "tokenize_chinese_chars": true,
53
+ "tokenizer_class": "DistilBertTokenizer",
54
+ "unk_token": "[UNK]"
55
+ }
trained_model_v2/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
verifier.py ADDED
@@ -0,0 +1,422 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Internet Verifier for VeriLens AI
3
+ - Searches the web via Google News RSS for live, rate-limit-proof verification.
4
+ - Searches Wikipedia API for historical fact verification.
5
+ - Computes strict semantic entailment using a Cross-Encoder.
6
+ """
7
+
8
+ from __future__ import annotations
9
+ import urllib.request
10
+ import urllib.parse
11
+ import xml.etree.ElementTree as ET
12
+ import re
13
+ import json # <-- Added for Wikipedia API
14
+ import numpy as np # <-- Added for softmax over NLI logits
15
+
16
+ import asyncio
17
+ import logging
18
+ from dataclasses import dataclass, field
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ # ── Lazy-loaded Cross-Encoder ────────────────────────────────────────
23
+ _cross_model = None
24
+
25
def _get_cross_model():
    """Return the shared NLI Cross-Encoder, loading it on first use.

    The loaded model is cached in the module-level ``_cross_model``. When
    loading fails a warning is logged and None is returned; the next call
    tries again because nothing was cached.
    """
    global _cross_model
    if _cross_model is not None:
        return _cross_model

    try:
        from sentence_transformers import CrossEncoder
        logger.info("Loading Multilingual NLI Cross-Encoder model…")
        # ⚡ Multilingual mDeBERTa — supports 100+ languages for global claim verification
        # Label order: [entailment=0, neutral=1, contradiction=2]
        _cross_model = CrossEncoder("MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7")
        logger.info("Multilingual NLI Cross-Encoder loaded successfully.")
    except Exception as exc:
        logger.warning("Could not load NLI Cross-Encoder: %s", exc)
    return _cross_model
38
+
39
+
40
@dataclass
class SourceArticle:
    """One search hit used as evidence for a claim."""

    # Headline of the hit.
    title: str
    # Link to the hit.
    url: str
    # Short text excerpt of the hit.
    snippet: str
    # Trust rating: "high", "medium" or "low".
    trust: str = "medium"
46
+
47
+
48
@dataclass
class VerificationResult:
    """Aggregated outcome of verifying one claim against external sources."""

    # Similarity score of the claim against the sources; defaults to 0.0.
    similarity_score: float = 0.0
    # Source articles that were considered; every instance gets its own list.
    sources: list[SourceArticle] = field(default_factory=list)
    # Verification flag; defaults to False.
    verified: bool = False
53
+
54
+
55
+ # ── Trusted domains (Expanded Global & Indian Scope) ───────────────────────
56
HIGH_TRUST_DOMAINS = {
    "wikipedia.org",  # <-- Added Wikipedia as a Ground Truth Source

    # 🌍 Global Wire Services (The original sources of most news)
    "reuters.com", "apnews.com", "bloomberg.com", "afp.com", "upi.com",

    # 🇺🇸/🇬🇧 Major US, UK & International Media
    "bbc.com", "bbc.co.uk", "nytimes.com", "washingtonpost.com", "wsj.com",
    "theguardian.com", "npr.org", "pbs.org", "cnn.com", "ft.com",
    "aljazeera.com", "dw.com", "france24.com", "scmp.com", "nbcnews.com",
    "cbsnews.com", "abcnews.go.com", "theatlantic.com", "time.com", "economist.com",

    # 🇮🇳 Indian National & Regional Heavyweights
    "thehindu.com", "hindustantimes.com", "indianexpress.com", "timesofindia.indiatimes.com",
    "ndtv.com", "indiatoday.in", "theprint.in", "thewire.in", "scroll.in",
    "livemint.com", "business-standard.com", "deccanherald.com", "telegraphindia.com",
    "tribuneindia.com", "newindianexpress.com", "firstpost.com", "thequint.com",
    "cnbctv18.com", "moneycontrol.com", "aninews.in", "ptinews.com", "freepressjournal.in",

    # 🔎 Dedicated Fact-Checkers (Massive Trust Boost if found)
    "snopes.com", "politifact.com", "factcheck.org", "altnews.in", "boomlive.in",
    "newschecker.in", "vishvasnews.com", "smhoaxinvestigator.com", "factchecker.in",

    # 🌐 High-Trust Aggregators
    "yahoo.com/news", "msn.com", "news.google.com"
}

# ── Low Trust / Disinformation / Satire domains ────────────────────────────
LOW_TRUST_DOMAINS = {
    # ⚠️ Known Fake News, Pseudoscience & Conspiracy
    "infowars.com", "naturalnews.com", "beforeitsnews.com", "thegatewaypundit.com",
    "zerohedge.com", "worldnewsdailyreport.com", "nationalreport.net",

    # 📢 State-Sponsored Propaganda
    "rt.com", "sputniknews.com", "globaltimes.cn",

    # 🇮🇳 Indian High-Bias / Frequently Flagged for Disinformation
    "postcard.news", "opindia.com", "tfipost.com", "kreately.in", "rightlog.in",

    # 🤡 Satire (If your engine matches these, the news is definitely fake)
    "theonion.com", "babylonbee.com", "fakingnews.com", "thefauxy.com",
    "thedailymash.co.uk", "waterfordwhispersnews.com", "clickhole.com"
}


def _split_host_path(url: str) -> tuple[str, str]:
    """Return the lower-cased ``(hostname, path)`` of *url*; empty strings if unparsable."""
    candidate = url.strip().lower()
    # urlsplit only recognises a host after "//", so scheme-less input such as
    # "bbc.co.uk/news" gets a network-path prefix first.
    if not re.match(r"[a-z][a-z0-9+.\-]*://", candidate) and not candidate.startswith("//"):
        candidate = "//" + candidate
    try:
        parts = urllib.parse.urlsplit(candidate)
        host = parts.hostname or ""
    except ValueError:
        return "", ""
    return host, parts.path


def _domain_matches(host: str, path: str, entry: str) -> bool:
    """Return True when *host*/*path* belong to the domain-list *entry*.

    *entry* is either a bare domain ("reuters.com") or a domain plus path
    prefix ("yahoo.com/news"). The host must equal the domain or be one of its
    sub-domains; a path prefix, when present, must also match.
    """
    domain, _, prefix = entry.partition("/")
    if host != domain and not host.endswith("." + domain):
        return False
    return not prefix or path.startswith("/" + prefix)


def _trust_level(url: str, snippet: str = "", title: str = "") -> str:
    """Evaluates trust based on URL domain AND snippet/title signatures.

    Order of checks:
      1. host is (a sub-domain of) a ``HIGH_TRUST_DOMAINS`` entry -> "high"
      2. host is (a sub-domain of) a ``LOW_TRUST_DOMAINS`` entry  -> "low"
      3. snippet or title names a major wire service / outlet      -> "high"
      4. otherwise                                                 -> "medium"

    Domains are compared against the parsed hostname, not searched as
    substrings of the URL, so "ft.com" no longer matches "microsoft.com" and
    "rt.com" no longer matches "transport.com". The low-trust check runs
    before the keyword heuristic so that a known satire/disinformation site
    cannot be promoted to "high" by mentioning e.g. "CNN" in its headline.
    """
    host, path = _split_host_path(url)

    # 1. Check URL Domains
    if any(_domain_matches(host, path, d) for d in HIGH_TRUST_DOMAINS):
        return "high"

    # 2. Check for known low-trust/satire sites
    if any(_domain_matches(host, path, d) for d in LOW_TRUST_DOMAINS):
        return "low"

    # 3. Check snippet OR title for major syndicated wire services
    lower_snippet = snippet.lower()
    lower_title = title.lower()
    high_trust_keywords = ["reuters", "associated press", "bbc", "cnn", "the new york times", "bloomberg"]
    for keyword in high_trust_keywords:
        if keyword in lower_snippet or keyword in lower_title:
            return "high"

    return "medium"
124
+
125
+
126
+ # ── Locale detection for multilingual search ─────────────────────────────
127
_LOCALE_MAP = {
    (0x0900, 0x097F): ('hi', 'IN'),  # Devanagari → Hindi
    (0x0980, 0x09FF): ('bn', 'IN'),  # Bengali
    (0x0A00, 0x0A7F): ('pa', 'IN'),  # Gurmukhi → Punjabi
    (0x0A80, 0x0AFF): ('gu', 'IN'),  # Gujarati
    (0x0B80, 0x0BFF): ('ta', 'IN'),  # Tamil
    (0x0C00, 0x0C7F): ('te', 'IN'),  # Telugu
    (0x0C80, 0x0CFF): ('kn', 'IN'),  # Kannada
    (0x0D00, 0x0D7F): ('ml', 'IN'),  # Malayalam
    (0x0600, 0x06FF): ('ar', 'AE'),  # Arabic
    (0x4E00, 0x9FFF): ('zh', 'CN'),  # CJK → Chinese
    (0x3040, 0x30FF): ('ja', 'JP'),  # Hiragana/Katakana → Japanese
    (0xAC00, 0xD7AF): ('ko', 'KR'),  # Hangul → Korean
    (0x0400, 0x04FF): ('ru', 'RU'),  # Cyrillic → Russian
}


def _detect_locale(query: str) -> tuple[str, str]:
    """Return the ``(lang, country)`` pair for the script used in *query*.

    Characters are scanned left to right; the first one whose code point lies
    in a ``_LOCALE_MAP`` range decides the result. Characters outside every
    range (ASCII, accented Latin, ...) are skipped. Falls back to
    ``('en', 'US')`` when nothing matches.
    """
    for code_point in map(ord, query):
        locale = next(
            (loc for (low, high), loc in _LOCALE_MAP.items() if low <= code_point <= high),
            None,
        )
        if locale is not None:
            return locale
    return ('en', 'US')  # default to English
152
+
153
+
154
def _fetch_google_rss(url: str, num_results: int) -> list[dict]:
    """Fetch and parse a Google News RSS URL into a list of result dicts.

    Each of the first *num_results* ``<item>`` elements becomes
    ``{"title": ..., "href": ..., "body": ...}``, where ``body`` is the item
    description with markup tags stripped. Missing or empty child elements
    yield empty strings, never ``None``, so one sparse item cannot break the
    tag stripping and discard the whole feed.

    Network and XML parsing errors propagate to the caller.
    """
    print(f" 🌐 GOOGLE NEWS URL: {url}")
    req = urllib.request.Request(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)'}
    )
    with urllib.request.urlopen(req, timeout=10) as response:
        xml_data = response.read()

    # NOTE(review): the feed is remote input parsed with the stdlib ElementTree
    # parser, which the Python docs describe as not hardened against
    # maliciously constructed XML — consider a hardened parser if this ever
    # fetches from hosts other than Google News.
    root = ET.fromstring(xml_data)

    results = []
    for item in root.findall('.//item')[:num_results]:
        # findtext() returns None for a missing element; normalise to "".
        title_text = item.findtext('title') or ""
        link_text = item.findtext('link') or ""
        desc_html = item.findtext('description') or ""
        snippet = re.sub('<[^<]+>', '', desc_html)
        results.append({"title": title_text, "href": link_text, "body": snippet})
    print(f" 📰 Results found: {len(results)}")
    return results
176
+
177
+
178
def _google_news_search(query: str, num_results: int = 8) -> list[dict]:
    """
    Multilingual Google News RSS search.

    1. Detect locale from query script (Hindi→hi/IN, Bengali→bn/IN, etc.)
    2. Search with detected locale
    3. Fallback: search with no locale (Google auto-detects)
    4. Fallback: slice to first 6 words and retry (only when the query has
       more than 6 words; otherwise the retry would repeat step 2 unchanged)

    Returns the result dicts of the first attempt that yields any, or an
    empty list. Any exception is logged and also results in an empty list.
    """
    try:
        safe_query = urllib.parse.quote(query)
        lang, country = _detect_locale(query)

        print(f"\n{'='*50}")
        print(f"🔍 GOOGLE NEWS SEARCH")
        print(f" Query: {query[:80]}{'...' if len(query) > 80 else ''}")
        print(f" Detected locale: hl={lang}, gl={country}")

        # Attempt 1: Search with detected locale
        url = f"https://news.google.com/rss/search?q={safe_query}&hl={lang}&gl={country}&ceid={country}:{lang}"
        results = _fetch_google_rss(url, num_results)

        # Attempt 2: No locale params → let Google infer
        if not results:
            print(" ⚠️ Zero results. Retrying with no locale params...")
            url_nolang = f"https://news.google.com/rss/search?q={safe_query}"
            results = _fetch_google_rss(url_nolang, num_results)

        # Attempt 3: Query slicing → first 6 words only
        if not results:
            words = query.split()
            # Slicing only shortens the query when it has more than 6 words.
            if len(words) > 6:
                short_query = " ".join(words[:6])
                safe_short = urllib.parse.quote(short_query)
                print(f" ⚠️ Still zero. Slicing to 6 words: '{short_query}'")
                url_short = f"https://news.google.com/rss/search?q={safe_short}&hl={lang}&gl={country}&ceid={country}:{lang}"
                results = _fetch_google_rss(url_short, num_results)

        print(f" ✅ Final result count: {len(results)}")
        print(f"{'='*50}\n")
        return results

    except Exception as exc:
        # Deliberate best-effort: verification continues without news results.
        logger.error("Google News search failed: %s", exc)
        return []
222
+
223
+
224
def _wikipedia_search(query: str) -> list[dict]:
    """
    Multilingual Wikipedia search.

    English Wikipedia is queried first. Only when that yields nothing and the
    query contains non-ASCII characters is the edition for the detected
    language queried instead. At most two hits per edition are returned, each
    as ``{"title", "href", "body"}``. Any exception is logged and results in
    an empty list.
    """
    def _wiki_query(wiki_lang: str, q: str) -> list[dict]:
        # One search request against the given Wikipedia language edition.
        safe_query = urllib.parse.quote(q)
        url = f"https://{wiki_lang}.wikipedia.org/w/api.php?action=query&list=search&srsearch={safe_query}&utf8=&format=json"
        print(f" 📚 WIKIPEDIA URL ({wiki_lang}): {url[:120]}...")
        request = urllib.request.Request(
            url,
            headers={'User-Agent': 'VeriLensAI/1.0 (University Fact-Checking Project)'}
        )
        with urllib.request.urlopen(request, timeout=10) as response:
            payload = json.loads(response.read().decode())

        top_hits = payload.get('query', {}).get('search', [])[:2]
        results = [
            {
                "title": f"{hit['title']} - Wikipedia",
                "href": f"https://{wiki_lang}.wikipedia.org/wiki/{urllib.parse.quote(hit['title'].replace(' ', '_'))}",
                # Search snippets carry highlight markup; strip the tags.
                "body": re.sub('<[^<]+>', '', hit['snippet']),
            }
            for hit in top_hits
        ]
        print(f" 📚 Wikipedia ({wiki_lang}) results: {len(results)}")
        return results

    try:
        # 1. Try English Wikipedia first
        hits = _wiki_query('en', query)
        if hits or all(ord(c) <= 127 for c in query):
            return hits

        # 2. Nothing found and the query is non-ASCII: try the matching edition
        detected_lang, _ = _detect_locale(query)
        if detected_lang == 'en':
            return hits
        logger.info(f"Retrying Wikipedia with lang={detected_lang} for non-ASCII query")
        return _wiki_query(detected_lang, query)
    except Exception as exc:
        logger.error("Wikipedia search failed: %s", exc)
        return []
267
+
268
+
269
def _balance_results(news: list[dict], wiki: list[dict], num_results: int) -> list[dict]:
    """Merge news and wiki hits, giving news the first half of the quota."""
    half_quota = num_results // 2
    merged = news[:half_quota] + wiki[:num_results - half_quota]

    # If the merged list is still short of the quota, top it up with the
    # news items that were left over after the first half.
    shortfall = num_results - len(merged)
    if shortfall > 0:
        merged.extend(news[half_quota:half_quota + shortfall])
    return merged


async def _search_web(query: str, num_results: int = 8) -> list[dict]:
    """
    Search Google News and Wikipedia concurrently and merge the results.

    Args:
        query: Search string passed to both back-ends.
        num_results: Maximum number of merged results to return.

    Returns:
        Up to ``num_results`` result dicts, news first, then Wikipedia, then
        any additional news used to fill unused slots. If the full query
        yields nothing and it is longer than six words, both searches are
        retried once with only the first six words.
    """
    # Both back-ends are blocking, so each runs in a worker thread and the
    # two are awaited together.
    news_results, wiki_results = await asyncio.gather(
        asyncio.to_thread(_google_news_search, query, num_results),
        asyncio.to_thread(_wikipedia_search, query),
    )
    balanced_results = _balance_results(news_results, wiki_results, num_results)

    # SHORT-QUERY FALLBACK: only worthwhile when truncating to six words
    # actually shortens the query; otherwise the retry would repeat the
    # search that just returned nothing.
    if not balanced_results:
        words = query.split()
        if len(words) > 6:
            short_query = " ".join(words[:6])
            logger.info(
                "Zero results for full query. Retrying with short query: '%s'",
                short_query,
            )
            news_retry, wiki_retry = await asyncio.gather(
                asyncio.to_thread(_google_news_search, short_query, num_results),
                asyncio.to_thread(_wikipedia_search, short_query),
            )
            balanced_results = _balance_results(news_retry, wiki_retry, num_results)

    return balanced_results
303
+
304
+
305
# Minimum NLI entailment probability a source must reach to count as relevant.
# Deliberately strict: only sources whose text genuinely ENTAILS the claim
# should pass, which is a tougher bar than the older STS similarity check.
MIN_RELEVANCE_THRESHOLD: float = 0.75

# Output column order for MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7:
#   0 = Entailment, 1 = Neutral, 2 = Contradiction
_NLI_ENTAILMENT_IDX: int = 0
312
+
313
+
314
+ def _softmax(logits: np.ndarray) -> np.ndarray:
315
+ """Numerically-stable softmax over the last axis."""
316
+ exp = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
317
+ return exp / np.sum(exp, axis=-1, keepdims=True)
318
+
319
+
320
+ def _compute_per_source_similarity(text: str, snippets: list[str]) -> list[float]:
321
+ """
322
+ Compute strict semantic entailment using an NLI Cross-Encoder.
323
+
324
+ The model outputs raw logits for [Contradiction, Entailment, Neutral].
325
+ We apply softmax and return the Entailment probability (0.0 → 1.0)
326
+ so that keyword-overlap alone can no longer fool the system.
327
+ """
328
+ model = _get_cross_model()
329
+ if model is None or not snippets:
330
+ return [0.0] * len(snippets)
331
+
332
+ try:
333
+ # Cross-Encoders take PAIRS: (premise=article, hypothesis=claim)
334
+ pairs = [[snippet[:512], text[:512]] for snippet in snippets]
335
+
336
+ # NLI models return raw logits of shape (N, 3)
337
+ logits = model.predict(pairs)
338
+ logits = np.array(logits)
339
+
340
+ # Ensure 2-D even for a single pair
341
+ if logits.ndim == 1:
342
+ logits = logits.reshape(1, -1)
343
+
344
+ # Softmax → probabilities, then grab the Entailment column
345
+ probs = _softmax(logits)
346
+ entailment_scores = probs[:, _NLI_ENTAILMENT_IDX]
347
+
348
+ return [float(s) for s in entailment_scores]
349
+ except Exception as exc:
350
+ logger.error("NLI entailment computation failed: %s", exc)
351
+ return [0.0] * len(snippets)
352
+
353
+
354
# Wikipedia snippets are encyclopedic context rather than direct reporting, so
# they get a lower bar than news, but one still high enough to reject noise.
_WIKIPEDIA_RELEVANCE_THRESHOLD = 0.45


def _is_wikipedia_url(url: str) -> bool:
    """Return True when the host of *url* is wikipedia.org or a subdomain of it."""
    try:
        # A scheme-less URL would otherwise be parsed as a bare path.
        parsed = urllib.parse.urlparse(url if "://" in url else f"//{url}")
    except ValueError:
        return False
    host = parsed.hostname or ""
    return host == "wikipedia.org" or host.endswith(".wikipedia.org")


async def verify_claim(text: str, search_query: str) -> VerificationResult:
    """
    Search the internet for articles related to *search_query*,
    compute per-source semantic entailment, and discard irrelevant results.

    Args:
        text: The claim whose support is being measured.
        search_query: The query sent to the web search back-ends.

    Returns:
        A ``VerificationResult`` with:
        - ``verified=False`` and no sources when the search returned nothing;
        - ``verified=True``, score 0.0 and no sources when results were found
          but none reached its relevance threshold;
        - ``verified=True``, the accepted sources and their mean entailment
          score (rounded to 4 places) otherwise.
    """
    items = await _search_web(search_query)

    if not items:
        return VerificationResult(similarity_score=0.0, sources=[], verified=False)

    # Build candidate lists
    candidates: list[SourceArticle] = []
    snippets: list[str] = []

    # Every returned item is scored; slicing here would drop the Wikipedia
    # entries, which come after the news entries in the merged list.
    for item in items:
        title = item.get("title", "")
        link = item.get("url", "") or item.get("href", "")
        snippet = item.get("body", "")

        candidates.append(
            SourceArticle(
                title=title,
                url=link,
                snippet=snippet,
                trust=_trust_level(url=link, snippet=snippet, title=title),
            )
        )
        snippets.append(f"{title}. {snippet}")

    # Scoring is blocking model inference, so it runs in a worker thread.
    scores = await asyncio.to_thread(_compute_per_source_similarity, text, snippets)

    # Filter: only keep sources above the relevance threshold
    sources: list[SourceArticle] = []
    relevant_scores: list[float] = []

    # Debug trace of every score and accept/reject decision.
    print("\n" + "="*50)
    print("🧠 CROSS-ENCODER SCORES:")

    for candidate, score in zip(candidates, scores):
        print(f"Score: {score:.3f} | Source: {candidate.url}")

        # Match on the hostname, not the whole URL, so a non-Wikipedia link
        # that merely contains "wikipedia.org" does not get the lower bar.
        if _is_wikipedia_url(candidate.url):
            required_score = _WIKIPEDIA_RELEVANCE_THRESHOLD
        else:
            required_score = MIN_RELEVANCE_THRESHOLD  # strict NLI entailment for news

        if score >= required_score:
            sources.append(candidate)
            relevant_scores.append(score)
            print(f" -> ✅ ACCEPTED (Requires >= {required_score})")
        else:
            print(f" -> ❌ REJECTED (Requires >= {required_score})")

    print("="*50 + "\n")

    if not sources:
        # The search itself succeeded (items were returned), so this stays
        # verified=True; the 0.0 score records that nothing supported the claim.
        return VerificationResult(similarity_score=0.0, sources=[], verified=True)

    avg_similarity = sum(relevant_scores) / len(relevant_scores)

    return VerificationResult(
        similarity_score=round(avg_similarity, 4),
        sources=sources,
        verified=True,
    )