Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -8,6 +8,7 @@ import hashlib
|
|
| 8 |
import logging
|
| 9 |
import re
|
| 10 |
import time
|
|
|
|
| 11 |
from contextlib import asynccontextmanager
|
| 12 |
from datetime import datetime, timedelta
|
| 13 |
import random
|
|
@@ -21,9 +22,11 @@ from pydantic import BaseModel, Field
|
|
| 21 |
from model import classify, load_model
|
| 22 |
from nlp_utils import build_search_query, detect_language, detect_suspicious_phrases, extract_keywords
|
| 23 |
from scraper import extract_article
|
| 24 |
-
from verifier import verify_claim
|
| 25 |
from decision_engine import make_decision
|
| 26 |
|
|
|
|
|
|
|
|
|
|
| 27 |
# ── Logging ──────────────────────────────────────────────────────────────
|
| 28 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)-7s | %(name)s | %(message)s")
|
| 29 |
logger = logging.getLogger("verilens")
|
|
@@ -67,40 +70,34 @@ class SourceOut(BaseModel):
|
|
| 67 |
snippet: str
|
| 68 |
trust: str
|
| 69 |
|
| 70 |
-
# ── NEW: Origin & Mutation Map schemas ───────────────────────────────────
|
| 71 |
class OriginNode(BaseModel):
|
| 72 |
-
"""A node on the Origin & Mutation Map (newspaper clipping)."""
|
| 73 |
id: str
|
| 74 |
-
node_type: str
|
| 75 |
-
source_type: str
|
| 76 |
-
author: str
|
| 77 |
-
timestamp: str
|
| 78 |
-
snippet: str
|
| 79 |
-
url: str
|
| 80 |
|
| 81 |
class MutationConnection(BaseModel):
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
nli_score: int # percentage, e.g. 98
|
| 87 |
|
| 88 |
class GroundTruthItem(BaseModel):
|
| 89 |
-
"""One item in the evidence analysis list."""
|
| 90 |
index: int
|
| 91 |
text: str
|
| 92 |
-
badge: str
|
| 93 |
|
| 94 |
class GroundTruthData(BaseModel):
|
| 95 |
-
|
| 96 |
-
established_fact: str # The corrective summary
|
| 97 |
evidence_items: list[GroundTruthItem]
|
| 98 |
|
| 99 |
class OriginMapData(BaseModel):
|
| 100 |
nodes: list[OriginNode]
|
| 101 |
connections: list[MutationConnection]
|
| 102 |
|
| 103 |
-
# ── NEW: Frontend-compatible schemas (matches React sampleAnalysis) ──────
|
| 104 |
class FrontendAnnotation(BaseModel):
|
| 105 |
type: Literal['contradiction', 'fallacy', 'unverified', 'verified']
|
| 106 |
note: str
|
|
@@ -125,7 +122,7 @@ class FrontendEvidenceNode(BaseModel):
|
|
| 125 |
class FrontendConnection(BaseModel):
|
| 126 |
from_field: str = Field(alias="from", serialization_alias="from")
|
| 127 |
to: str
|
| 128 |
-
nli: dict
|
| 129 |
|
| 130 |
model_config = {"populate_by_name": True}
|
| 131 |
|
|
@@ -140,19 +137,17 @@ class AnalyzeResponse(BaseModel):
|
|
| 140 |
suspicious: dict
|
| 141 |
factors: dict
|
| 142 |
elapsed_ms: int
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
ground_truth: GroundTruthData # established fact + evidence items
|
| 148 |
-
# ── Frontend-compatible fields (React components) ────────────────────
|
| 149 |
claim: str
|
| 150 |
verdict: Literal['VERIFIED', 'FABRICATED', 'INCONCLUSIVE']
|
| 151 |
segments: list[FrontendSegment]
|
| 152 |
sourceTree: list[FrontendEvidenceNode]
|
| 153 |
connections: list[FrontendConnection]
|
| 154 |
-
groundTruth: str
|
| 155 |
-
confidenceExplanation: str
|
| 156 |
|
| 157 |
|
| 158 |
# ── Helpers: build supplementary data from existing signals ──────────────
|
|
@@ -166,38 +161,25 @@ _NODE_TYPES_HOSTILE = ["FORUM POST", "ANONYMOUS TIP", "CHAN BOARD", "DARK WEB PO
|
|
| 166 |
_NODE_TYPES_AMP = ["SOCIAL MEDIA", "BLOG", "REPOST", "VIRAL TWEET"]
|
| 167 |
|
| 168 |
def _generate_case_number(text: str) -> str:
|
| 169 |
-
"""Deterministic case number from input hash."""
|
| 170 |
h = hashlib.md5(text.encode()).hexdigest()
|
| 171 |
num = int(h[:6], 16) % 999999
|
| 172 |
return f"TB-{num:06d}"
|
| 173 |
|
| 174 |
def _build_origin_map(sources: list, verification_score: float, text: str) -> OriginMapData:
|
| 175 |
-
"""
|
| 176 |
-
Build the Origin & Mutation Map from existing source data.
|
| 177 |
-
Maps sources into Hostile Actor / Amplifier / Current Claim nodes
|
| 178 |
-
and creates NLI connections between them.
|
| 179 |
-
"""
|
| 180 |
nodes: list[OriginNode] = []
|
| 181 |
connections: list[MutationConnection] = []
|
| 182 |
-
|
| 183 |
now = datetime.now()
|
| 184 |
-
rng = random.Random(hash(text))
|
| 185 |
|
| 186 |
if not sources:
|
| 187 |
-
# Even with no sources, show the current claim node
|
| 188 |
nodes.append(OriginNode(
|
| 189 |
-
id="claim_0",
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
author="USER SUBMISSION",
|
| 193 |
-
timestamp=now.strftime("%Y-%m-%d %H:%M"),
|
| 194 |
-
snippet=text[:120] + ("β¦" if len(text) > 120 else ""),
|
| 195 |
-
url="",
|
| 196 |
))
|
| 197 |
return OriginMapData(nodes=nodes, connections=connections)
|
| 198 |
|
| 199 |
-
|
| 200 |
-
for i, src in enumerate(sources[:4]): # max 4 nodes on the map
|
| 201 |
if src.trust == "low":
|
| 202 |
ntype = "hostile_actor"
|
| 203 |
stype = rng.choice(_NODE_TYPES_HOSTILE)
|
|
@@ -209,7 +191,6 @@ def _build_origin_map(sources: list, verification_score: float, text: str) -> Or
|
|
| 209 |
else:
|
| 210 |
ntype = "current_claim"
|
| 211 |
stype = "MAJOR NEWS OUTLET"
|
| 212 |
-
# Extract outlet name from title
|
| 213 |
author = src.title.split(" - ")[-1] if " - " in src.title else src.title[:30]
|
| 214 |
|
| 215 |
days_ago = rng.randint(1, 14)
|
|
@@ -218,67 +199,41 @@ def _build_origin_map(sources: list, verification_score: float, text: str) -> Or
|
|
| 218 |
ts = (now - timedelta(days=days_ago)).replace(hour=hours, minute=minutes)
|
| 219 |
|
| 220 |
nodes.append(OriginNode(
|
| 221 |
-
id=f"node_{i}",
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
author=author,
|
| 225 |
-
timestamp=ts.strftime("%Y-%m-%d %H:%M"),
|
| 226 |
-
snippet=src.snippet[:150] if src.snippet else src.title,
|
| 227 |
-
url=src.url,
|
| 228 |
))
|
| 229 |
|
| 230 |
-
# Create connections between sequential nodes with NLI scores
|
| 231 |
for i in range(len(nodes) - 1):
|
| 232 |
-
# Derive NLI label from verification score + source trust
|
| 233 |
score_base = int(verification_score * 100) if verification_score else 50
|
| 234 |
jitter = rng.randint(-15, 15)
|
| 235 |
nli_score = max(10, min(99, score_base + jitter))
|
| 236 |
|
| 237 |
-
# High scores on high-trust = ENTAILMENT, low trust = CONTRADICTION
|
| 238 |
src_trust = sources[i].trust if i < len(sources) else "medium"
|
| 239 |
if src_trust == "low":
|
| 240 |
nli_label = "CONTRADICTION"
|
| 241 |
-
nli_score = max(70, nli_score)
|
| 242 |
elif nli_score >= 60:
|
| 243 |
nli_label = "ENTAILMENT"
|
| 244 |
else:
|
| 245 |
nli_label = "CONTRADICTION"
|
| 246 |
|
| 247 |
connections.append(MutationConnection(
|
| 248 |
-
from_node=nodes[i].id,
|
| 249 |
-
|
| 250 |
-
nli_label=nli_label,
|
| 251 |
-
nli_score=nli_score,
|
| 252 |
))
|
| 253 |
|
| 254 |
return OriginMapData(nodes=nodes, connections=connections)
|
| 255 |
|
| 256 |
|
| 257 |
-
def _build_ground_truth(
|
| 258 |
-
prediction: str,
|
| 259 |
-
explanation: str,
|
| 260 |
-
suspicious: dict,
|
| 261 |
-
keywords: list[str],
|
| 262 |
-
sources: list,
|
| 263 |
-
) -> GroundTruthData:
|
| 264 |
-
"""Build the Established Fact + Evidence Analysis from existing signals."""
|
| 265 |
-
|
| 266 |
-
# The established fact is derived from the AI explanation
|
| 267 |
if prediction == "Fake":
|
| 268 |
-
established_fact = (
|
| 269 |
-
f"Based on cross-referencing {len(sources)} sources and NLI entailment analysis, "
|
| 270 |
-
f"this claim could not be substantiated. {explanation}"
|
| 271 |
-
)
|
| 272 |
elif prediction == "Real":
|
| 273 |
-
established_fact = (
|
| 274 |
-
f"This claim has been corroborated by {len(sources)} independent sources. {explanation}"
|
| 275 |
-
)
|
| 276 |
else:
|
| 277 |
-
established_fact = (
|
| 278 |
-
f"Verification produced mixed results across {len(sources)} sources. {explanation}"
|
| 279 |
-
)
|
| 280 |
|
| 281 |
-
# Build evidence items from suspicious phrases + source data
|
| 282 |
items: list[GroundTruthItem] = []
|
| 283 |
idx = 1
|
| 284 |
|
|
@@ -298,231 +253,111 @@ def _build_ground_truth(
|
|
| 298 |
items.append(GroundTruthItem(index=idx, text=f'Unsupported attribution: "{phrase}"', badge="UNVERIFIED"))
|
| 299 |
idx += 1
|
| 300 |
|
| 301 |
-
# Add source-based evidence
|
| 302 |
high_trust_sources = [s for s in sources if s.trust == "high"]
|
| 303 |
low_trust_sources = [s for s in sources if s.trust == "low"]
|
| 304 |
|
| 305 |
if high_trust_sources:
|
| 306 |
-
items.append(GroundTruthItem(
|
| 307 |
-
index=idx,
|
| 308 |
-
text=f"Corroborated by {len(high_trust_sources)} high-trust source(s): {high_trust_sources[0].title[:60]}",
|
| 309 |
-
badge="CORROBORATED",
|
| 310 |
-
))
|
| 311 |
idx += 1
|
| 312 |
|
| 313 |
if low_trust_sources:
|
| 314 |
-
items.append(GroundTruthItem(
|
| 315 |
-
index=idx,
|
| 316 |
-
text=f"Found in {len(low_trust_sources)} low-trust source(s) β possible disinformation origin",
|
| 317 |
-
badge="CONTRADICTION",
|
| 318 |
-
))
|
| 319 |
idx += 1
|
| 320 |
|
| 321 |
if not items:
|
| 322 |
-
items.append(GroundTruthItem(
|
| 323 |
-
index=1,
|
| 324 |
-
text="No specific evidence markers detected in the text",
|
| 325 |
-
badge="UNVERIFIED",
|
| 326 |
-
))
|
| 327 |
|
| 328 |
return GroundTruthData(established_fact=established_fact, evidence_items=items)
|
| 329 |
|
| 330 |
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
_SOURCE_LAYOUT_NEWS = [
|
| 336 |
-
(20.0, 30.0, -2),
|
| 337 |
-
(50.0, 80.0, 3),
|
| 338 |
-
(15.0, 60.0, 1),
|
| 339 |
-
(60.0, 45.0, -3),
|
| 340 |
-
]
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
def _build_direct_source_tree(
|
| 344 |
-
text: str,
|
| 345 |
-
sources: list,
|
| 346 |
-
verification_score: float,
|
| 347 |
-
per_source_scores: list[float] | None = None,
|
| 348 |
-
) -> tuple[list[FrontendEvidenceNode], list[FrontendConnection]]:
|
| 349 |
-
"""
|
| 350 |
-
Build the Evidence Board directly from verification sources.
|
| 351 |
-
Ensures a diverse mix of Wikipedia (historical) + news sources.
|
| 352 |
-
Always produces β₯1 node (the claim). With sources β β₯3 nodes.
|
| 353 |
-
Returns (sourceTree, connections).
|
| 354 |
-
"""
|
| 355 |
now = datetime.now()
|
| 356 |
rng = random.Random(hash(text))
|
| 357 |
nodes: list[FrontendEvidenceNode] = []
|
| 358 |
conns: list[FrontendConnection] = []
|
| 359 |
|
| 360 |
-
# ββ Node 1: The Claim (always present) βββββββββββββββββββββββββββββββ
|
| 361 |
claim_node = FrontendEvidenceNode(
|
| 362 |
-
id="claim_0",
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
date=now.strftime("%Y-%m-%d %H:%M"),
|
| 366 |
-
author="SUBMITTED CLAIM",
|
| 367 |
-
content=text[:150] + ("β¦" if len(text) > 150 else ""),
|
| 368 |
-
x=50.0,
|
| 369 |
-
y=75.0,
|
| 370 |
-
rotation=2,
|
| 371 |
)
|
| 372 |
nodes.append(claim_node)
|
| 373 |
|
| 374 |
if not sources:
|
| 375 |
return nodes, conns
|
| 376 |
|
| 377 |
-
# ββ Separate Wikipedia (historical) from news sources ββββββββββββββββ
|
| 378 |
wiki_sources = [s for s in sources if "wikipedia.org" in s.url]
|
| 379 |
news_sources = [s for s in sources if "wikipedia.org" not in s.url]
|
|
|
|
| 380 |
|
| 381 |
-
# Build ordered list: Wikipedia first, then news, ensuring rich diversity
|
| 382 |
-
ordered: list[tuple] = [] # (source, layout_x, layout_y, layout_rot, source_type_label)
|
| 383 |
-
|
| 384 |
-
# Always include Wikipedia if available
|
| 385 |
for ws in wiki_sources[:1]:
|
| 386 |
x, y, rot = _SOURCE_LAYOUT_WIKI
|
| 387 |
ordered.append((ws, x, y, rot, "Historical Archive"))
|
| 388 |
|
| 389 |
-
# Always include at least 2 news articles
|
| 390 |
news_idx = 0
|
| 391 |
for ns in news_sources[:3]:
|
| 392 |
x, y, rot = _SOURCE_LAYOUT_NEWS[news_idx % len(_SOURCE_LAYOUT_NEWS)]
|
| 393 |
ordered.append((ns, x, y, rot, "News Article"))
|
| 394 |
news_idx += 1
|
| 395 |
|
| 396 |
-
# If we still have < 3 sources, fill with remaining Wikipedia
|
| 397 |
if len(ordered) < 3:
|
| 398 |
for ws in wiki_sources[1:3 - len(ordered) + 1]:
|
| 399 |
x, y, rot = _SOURCE_LAYOUT_NEWS[news_idx % len(_SOURCE_LAYOUT_NEWS)]
|
| 400 |
ordered.append((ws, x, y, rot, "Historical Archive"))
|
| 401 |
news_idx += 1
|
| 402 |
|
| 403 |
-
# ββ Build nodes + connections for each source ββββββββββββββββββββββββ
|
| 404 |
-
# Build a score lookup for per-source NLI
|
| 405 |
source_score_map: dict[str, float] = {}
|
| 406 |
if per_source_scores and len(per_source_scores) == len(sources):
|
| 407 |
for s, sc in zip(sources, per_source_scores):
|
| 408 |
source_score_map[s.url] = sc
|
| 409 |
|
| 410 |
for i, (src, x, y, rot, type_label) in enumerate(ordered[:4]):
|
| 411 |
-
|
| 412 |
-
if src.
|
| 413 |
-
role = "hostile"
|
| 414 |
-
else:
|
| 415 |
-
role = "amplifier"
|
| 416 |
-
|
| 417 |
-
# Extract a readable author name
|
| 418 |
-
if " - " in src.title:
|
| 419 |
-
author = src.title.split(" - ")[-1].strip()[:30]
|
| 420 |
-
elif "wikipedia.org" in src.url:
|
| 421 |
-
author = "WIKIPEDIA"
|
| 422 |
-
else:
|
| 423 |
-
author = src.title[:30] if src.title else "Unknown Source"
|
| 424 |
|
| 425 |
days_ago = rng.randint(1, 14)
|
| 426 |
ts = (now - timedelta(days=days_ago)).strftime("%Y-%m-%d %H:%M")
|
| 427 |
node_id = f"source_{i + 1}"
|
| 428 |
|
| 429 |
nodes.append(FrontendEvidenceNode(
|
| 430 |
-
id=node_id,
|
| 431 |
-
|
| 432 |
-
type=type_label,
|
| 433 |
-
date=ts,
|
| 434 |
-
author=author,
|
| 435 |
-
content=src.snippet[:150] if src.snippet else src.title,
|
| 436 |
-
x=x,
|
| 437 |
-
y=y,
|
| 438 |
-
rotation=rot,
|
| 439 |
-
url=src.url if src.url else None,
|
| 440 |
))
|
| 441 |
|
| 442 |
-
# ββ Connection: source β claim with per-source NLI βββββββββββββββ
|
| 443 |
src_score = source_score_map.get(src.url, verification_score)
|
| 444 |
nli_type = "entailment" if src_score >= 0.65 else "contradiction"
|
| 445 |
nli_score = max(10, min(99, int(src_score * 100)))
|
| 446 |
|
| 447 |
-
conns.append(FrontendConnection(
|
| 448 |
-
from_field=node_id,
|
| 449 |
-
to="claim_0",
|
| 450 |
-
nli={"type": nli_type, "score": nli_score},
|
| 451 |
-
))
|
| 452 |
|
| 453 |
return nodes, conns
|
| 454 |
|
| 455 |
|
| 456 |
def _extract_ground_truth_string(sources: list) -> str:
|
| 457 |
-
|
| 458 |
-
if not sources:
|
| 459 |
-
return "No established fact could be determined from available sources."
|
| 460 |
-
|
| 461 |
-
# Prefer Wikipedia first
|
| 462 |
for s in sources:
|
| 463 |
-
if "wikipedia.org" in s.url:
|
| 464 |
-
return s.snippet[:300] if s.snippet else s.title
|
| 465 |
-
|
| 466 |
-
# Then any high-trust source
|
| 467 |
for s in sources:
|
| 468 |
-
if s.trust == "high" and s.snippet:
|
| 469 |
-
return s.snippet[:300]
|
| 470 |
-
|
| 471 |
-
# Fallback to first source with a snippet
|
| 472 |
for s in sources:
|
| 473 |
-
if s.snippet:
|
| 474 |
-
return s.snippet[:300]
|
| 475 |
-
|
| 476 |
return "No established fact could be determined from available sources."
|
| 477 |
|
| 478 |
|
| 479 |
-
def _build_segments(
|
| 480 |
-
text: str,
|
| 481 |
-
suspicious: dict,
|
| 482 |
-
ground_truth: GroundTruthData,
|
| 483 |
-
ml_label: str = "",
|
| 484 |
-
ml_confidence: float = 0.0,
|
| 485 |
-
) -> list[FrontendSegment]:
|
| 486 |
-
"""
|
| 487 |
-
Split the claim text into annotated segments.
|
| 488 |
-
Prepends a Linguistic Analysis segment with the ML model's reasoning,
|
| 489 |
-
then uses suspicious phrase detection + ground truth evidence.
|
| 490 |
-
"""
|
| 491 |
segments: list[FrontendSegment] = []
|
| 492 |
-
|
| 493 |
-
# ββ Segment 0: ML Model Linguistic Analysis ββββββββββββββββββββββββββ
|
| 494 |
if ml_label:
|
| 495 |
ml_label_display = ml_label.upper()
|
| 496 |
ml_pct = int(ml_confidence * 100)
|
| 497 |
-
if ml_label_display == "FAKE":
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
f"emotional manipulation, or patterns consistent with disinformation."
|
| 502 |
-
)
|
| 503 |
-
elif ml_label_display == "REAL":
|
| 504 |
-
ml_note = (
|
| 505 |
-
f"The local NLP model analyzed the linguistic syntax and scored "
|
| 506 |
-
f"this claim at {ml_pct}% REAL β professional journalistic tone "
|
| 507 |
-
f"detected with minimal sensationalist markers."
|
| 508 |
-
)
|
| 509 |
-
else:
|
| 510 |
-
ml_note = (
|
| 511 |
-
f"The local NLP model analyzed the linguistic syntax but could "
|
| 512 |
-
f"not reach a definitive conclusion (confidence: {ml_pct}%). "
|
| 513 |
-
f"The text contains a mix of professional and informal language patterns."
|
| 514 |
-
)
|
| 515 |
-
segments.append(FrontendSegment(
|
| 516 |
-
text=f"[LINGUISTIC ANALYSIS] ",
|
| 517 |
-
isSuspicious=True,
|
| 518 |
-
annotation=FrontendAnnotation(type="unverified", note=ml_note),
|
| 519 |
-
))
|
| 520 |
-
|
| 521 |
-
# ββ Collect evidence items as potential annotations βββββββββββββββββββ
|
| 522 |
-
evidence_annotations: list[tuple[str, str]] = []
|
| 523 |
-
for item in ground_truth.evidence_items:
|
| 524 |
-
evidence_annotations.append((item.badge, item.text))
|
| 525 |
|
|
|
|
| 526 |
sus_phrases: list[str] = []
|
| 527 |
for key in ["clickbait_phrases", "emotional_language", "unsupported_claims"]:
|
| 528 |
sus_phrases.extend(suspicious.get(key, []))
|
|
@@ -533,26 +368,15 @@ def _build_segments(
|
|
| 533 |
segments.append(FrontendSegment(text=text, isSuspicious=False))
|
| 534 |
return segments
|
| 535 |
|
| 536 |
-
badge_to_annotation_type = {
|
| 537 |
-
"FALLACY": "fallacy",
|
| 538 |
-
"UNVERIFIED": "unverified",
|
| 539 |
-
"CONTRADICTION": "contradiction",
|
| 540 |
-
"CORROBORATED": "verified",
|
| 541 |
-
}
|
| 542 |
-
|
| 543 |
evidence_idx = 0
|
| 544 |
|
| 545 |
for sentence in sentences:
|
| 546 |
sentence_text = sentence.strip()
|
| 547 |
-
if not sentence_text:
|
| 548 |
-
|
| 549 |
-
if not sentence_text.endswith(" "):
|
| 550 |
-
sentence_text += " "
|
| 551 |
-
|
| 552 |
is_sus = any(phrase.lower() in sentence_text.lower() for phrase in sus_phrases)
|
| 553 |
-
|
| 554 |
-
if not is_sus and evidence_idx < len(evidence_annotations) and len(sentences) <= 5:
|
| 555 |
-
is_sus = True
|
| 556 |
|
| 557 |
annotation = None
|
| 558 |
if is_sus and evidence_idx < len(evidence_annotations):
|
|
@@ -561,76 +385,31 @@ def _build_segments(
|
|
| 561 |
annotation = FrontendAnnotation(type=ann_type, note=note)
|
| 562 |
evidence_idx += 1
|
| 563 |
|
| 564 |
-
segments.append(FrontendSegment(
|
| 565 |
-
text=sentence_text,
|
| 566 |
-
isSuspicious=is_sus and annotation is not None,
|
| 567 |
-
annotation=annotation,
|
| 568 |
-
))
|
| 569 |
|
| 570 |
return segments
|
| 571 |
|
| 572 |
|
| 573 |
-
def _build_confidence_explanation(
|
| 574 |
-
ml_label: str,
|
| 575 |
-
ml_confidence: float,
|
| 576 |
-
similarity_score: float,
|
| 577 |
-
num_sources: int,
|
| 578 |
-
high_trust_count: int,
|
| 579 |
-
low_trust_count: int,
|
| 580 |
-
final_prediction: str,
|
| 581 |
-
final_confidence: int,
|
| 582 |
-
wiki_verified: bool,
|
| 583 |
-
) -> str:
|
| 584 |
-
"""Build a highly detailed, analytical explanation of how the confidence score was derived."""
|
| 585 |
parts: list[str] = []
|
| 586 |
-
|
| 587 |
-
# ββ 1. ML Model analysis βββββββββββββββββββββββββββββββββββββββββββββ
|
| 588 |
ml_pct = int(ml_confidence * 100)
|
| 589 |
-
parts.append(
|
| 590 |
-
|
| 591 |
-
f"classified the text as {ml_label.upper()} with {ml_pct}% internal "
|
| 592 |
-
f"confidence after analyzing syntax patterns, sensationalist markers, "
|
| 593 |
-
f"and journalistic tone indicators."
|
| 594 |
-
)
|
| 595 |
-
|
| 596 |
-
# ββ 2. Cross-Encoder verification ββββββββββββββββββββββββββββββββββββ
|
| 597 |
sim_pct = int(similarity_score * 100)
|
| 598 |
threshold_met = "PASSED" if similarity_score >= 0.65 else "FAILED"
|
| 599 |
-
parts.append(
|
| 600 |
-
|
| 601 |
-
f"{num_sources} source(s). The Cross-Encoder semantic similarity scored "
|
| 602 |
-
f"{sim_pct}% against the 65% entailment threshold ({threshold_met}). "
|
| 603 |
-
f"{'Wikipedia independently corroborated the claim.' if wiki_verified else 'No Wikipedia corroboration was found.'}"
|
| 604 |
-
)
|
| 605 |
-
|
| 606 |
-
# ββ 3. Source trust breakdown βββββββββββββββββββββββββββββββββββββββββ
|
| 607 |
medium_trust = num_sources - high_trust_count - low_trust_count
|
| 608 |
-
parts.append(
|
| 609 |
-
|
| 610 |
-
f"{high_trust_count} rated HIGH trust, {medium_trust} rated MEDIUM, "
|
| 611 |
-
f"and {low_trust_count} rated LOW. "
|
| 612 |
-
f"{'A strong evidence base supports this verdict.' if high_trust_count >= 2 else 'The evidence base is limited, which affects overall confidence.'}"
|
| 613 |
-
)
|
| 614 |
-
|
| 615 |
-
# ββ 4. Guardrail activations βββββββββββββββββββββββββββββββββββββββββ
|
| 616 |
guardrails: list[str] = []
|
| 617 |
-
if num_sources == 0:
|
| 618 |
-
|
| 619 |
-
if final_prediction == "Uncertain" and similarity_score < 0.78 and not wiki_verified:
|
| 620 |
-
guardrails.append("MUDDY WATERS GUARDRAIL (weak corroboration, verdict shifted to INCONCLUSIVE)")
|
| 621 |
|
| 622 |
-
if guardrails:
|
| 623 |
-
|
| 624 |
-
else:
|
| 625 |
-
parts.append("STEP 4 β GUARDRAILS: No safety overrides were triggered. The verdict reflects the raw analysis.")
|
| 626 |
-
|
| 627 |
-
# ββ 5. Final synthesis βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 628 |
-
parts.append(
|
| 629 |
-
f"FINAL SYNTHESIS: Combining the ML model's {ml_label.upper()} signal, "
|
| 630 |
-
f"the {sim_pct}% semantic match, and {num_sources} source(s), the system "
|
| 631 |
-
f"arrived at a final confidence of {final_confidence}%."
|
| 632 |
-
)
|
| 633 |
|
|
|
|
| 634 |
return " βΈ ".join(parts)
|
| 635 |
|
| 636 |
|
|
@@ -646,14 +425,52 @@ async def analyze(req: AnalyzeRequest):
|
|
| 646 |
raise HTTPException(status_code=400, detail="Input cannot be empty.")
|
| 647 |
|
| 648 |
t0 = time.time()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 649 |
|
| 650 |
if _is_url(raw):
|
| 651 |
input_type = "URL"
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 657 |
else:
|
| 658 |
input_type = "TEXT"
|
| 659 |
text = raw
|
|
@@ -663,104 +480,111 @@ async def analyze(req: AnalyzeRequest):
|
|
| 663 |
suspicious = detect_suspicious_phrases(text)
|
| 664 |
search_query = build_search_query(text)
|
| 665 |
|
| 666 |
-
|
| 667 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 668 |
|
| 669 |
high_trust = sum(1 for s in verification.sources if s.trust == "high")
|
| 670 |
low_trust = sum(1 for s in verification.sources if s.trust == "low")
|
| 671 |
|
| 672 |
-
# ββ Decision ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 673 |
decision = make_decision(
|
| 674 |
-
ml_label=ml_result.label,
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
sources_verified=verification.verified,
|
| 678 |
-
suspicious_info=suspicious,
|
| 679 |
-
high_trust_count=high_trust,
|
| 680 |
-
low_trust_count=low_trust,
|
| 681 |
)
|
| 682 |
|
| 683 |
-
final_prediction = str(decision.prediction).title()
|
| 684 |
final_confidence = int(decision.confidence)
|
| 685 |
final_explanation = str(decision.explanation)
|
| 686 |
-
# 🕵️ Check if Wikipedia is one of the verified sources
|
| 687 |
wiki_verified = any("wikipedia.org" in s.url for s in verification.sources)
|
| 688 |
|
| 689 |
-
#
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
|
|
|
|
|
|
| 693 |
final_prediction = "Fake"
|
| 694 |
-
final_confidence =
|
| 695 |
-
final_explanation = "
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
|
| 700 |
-
|
| 701 |
-
|
| 702 |
-
final_prediction = "
|
| 703 |
-
|
| 704 |
-
|
| 705 |
-
|
| 706 |
-
|
| 707 |
-
source_outs = [SourceOut(title=s.title, url=s.url, snippet=s.snippet, trust=s.trust)
|
| 708 |
-
for s in verification.sources]
|
| 709 |
-
|
| 710 |
verdict_label = _VERDICT_MAP.get(final_prediction, "UNDER REVIEW")
|
| 711 |
case_number = _generate_case_number(text)
|
| 712 |
origin_map = _build_origin_map(verification.sources, verification.similarity_score, text)
|
| 713 |
-
ground_truth = _build_ground_truth(
|
| 714 |
-
final_prediction, final_explanation, suspicious, keywords, verification.sources
|
| 715 |
-
)
|
| 716 |
|
| 717 |
-
# ββ Build frontend-compatible structures βββββββββββββββββββββββββββββ
|
| 718 |
frontend_verdict = _FRONTEND_VERDICT_MAP.get(final_prediction, "INCONCLUSIVE")
|
| 719 |
-
frontend_source_tree, frontend_connections = _build_direct_source_tree(
|
| 720 |
-
|
| 721 |
-
)
|
| 722 |
-
frontend_segments = _build_segments(
|
| 723 |
-
text, suspicious, ground_truth,
|
| 724 |
-
ml_label=ml_result.label, ml_confidence=ml_result.confidence,
|
| 725 |
-
)
|
| 726 |
ground_truth_string = _extract_ground_truth_string(verification.sources)
|
| 727 |
|
| 728 |
-
# ββ
|
| 729 |
confidence_explanation = _build_confidence_explanation(
|
| 730 |
-
ml_label=ml_result.label,
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
high_trust_count=high_trust,
|
| 735 |
-
low_trust_count=low_trust,
|
| 736 |
-
final_prediction=final_prediction,
|
| 737 |
-
final_confidence=final_confidence,
|
| 738 |
-
wiki_verified=wiki_verified,
|
| 739 |
)
|
| 740 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 741 |
elapsed = int((time.time() - t0) * 1000)
|
| 742 |
|
| 743 |
return AnalyzeResponse(
|
| 744 |
-
input_type=input_type,
|
| 745 |
-
|
| 746 |
-
|
| 747 |
-
|
| 748 |
-
|
| 749 |
-
language=language,
|
| 750 |
-
keywords=keywords,
|
| 751 |
-
suspicious=suspicious,
|
| 752 |
-
factors=decision.factors,
|
| 753 |
-
elapsed_ms=elapsed,
|
| 754 |
-
verdict_label=verdict_label,
|
| 755 |
-
case_number=case_number,
|
| 756 |
-
origin_map=origin_map,
|
| 757 |
-
ground_truth=ground_truth,
|
| 758 |
-
# ββ Frontend fields ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 759 |
-
claim=text,
|
| 760 |
-
verdict=frontend_verdict,
|
| 761 |
-
segments=frontend_segments,
|
| 762 |
-
sourceTree=frontend_source_tree,
|
| 763 |
-
connections=frontend_connections,
|
| 764 |
-
groundTruth=ground_truth_string,
|
| 765 |
-
confidenceExplanation=confidence_explanation,
|
| 766 |
)
|
|
|
|
| 8 |
import logging
|
| 9 |
import re
|
| 10 |
import time
|
| 11 |
+
import urllib.parse
|
| 12 |
from contextlib import asynccontextmanager
|
| 13 |
from datetime import datetime, timedelta
|
| 14 |
import random
|
|
|
|
| 22 |
from model import classify, load_model
|
| 23 |
from nlp_utils import build_search_query, detect_language, detect_suspicious_phrases, extract_keywords
|
| 24 |
from scraper import extract_article
|
|
|
|
| 25 |
from decision_engine import make_decision
|
| 26 |
|
| 27 |
+
# ── NEW: Import Trust Lists and Models directly from verifier ──
|
| 28 |
+
from verifier import verify_claim, HIGH_TRUST_DOMAINS, LOW_TRUST_DOMAINS, VerificationResult, SourceArticle
|
| 29 |
+
|
| 30 |
# ── Logging ──────────────────────────────────────────────────────────────
|
| 31 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)-7s | %(name)s | %(message)s")
|
| 32 |
logger = logging.getLogger("verilens")
|
|
|
|
| 70 |
snippet: str
|
| 71 |
trust: str
|
| 72 |
|
|
|
|
| 73 |
class OriginNode(BaseModel):
    """A node on the Origin & Mutation Map (newspaper clipping)."""

    # Unique node identifier, e.g. "claim_0" or "node_<i>".
    id: str
    # Role of the node on the map, e.g. "hostile_actor" or "current_claim".
    node_type: str
    # Human-readable source category, e.g. "MAJOR NEWS OUTLET".
    source_type: str
    # Outlet or author name displayed for the node.
    author: str
    # Timestamp string formatted as "%Y-%m-%d %H:%M".
    timestamp: str
    # Short excerpt of the source (or of the submitted claim).
    snippet: str
    # Source URL; an empty string is used for the submitted-claim node.
    url: str
|
| 81 |
|
| 82 |
class MutationConnection(BaseModel):
    """A directed NLI link between two nodes of the Origin & Mutation Map."""

    # Id of the node the link starts from.
    from_node: str
    # Id of the node the link points to.
    to_node: str
    # NLI verdict for the link: "ENTAILMENT" or "CONTRADICTION".
    nli_label: str
    # NLI score as a percentage, e.g. 98.
    nli_score: int
|
|
|
|
| 87 |
|
| 88 |
class GroundTruthItem(BaseModel):
    """One item in the evidence analysis list."""

    # Position of the item in the list (the builder numbers items from 1).
    index: int
    # Human-readable description of the evidence.
    text: str
    # Category badge, e.g. "UNVERIFIED", "CORROBORATED" or "CONTRADICTION".
    badge: str
|
| 92 |
|
| 93 |
class GroundTruthData(BaseModel):
    """Established fact plus the evidence items that back the verdict."""

    # The corrective summary.
    established_fact: str
    # Ordered evidence entries shown alongside the summary.
    evidence_items: list[GroundTruthItem]
|
| 96 |
|
| 97 |
class OriginMapData(BaseModel):
    """Container for the Origin & Mutation Map: its nodes and their links."""

    # Nodes placed on the map.
    nodes: list[OriginNode]
    # NLI links between consecutive nodes.
    connections: list[MutationConnection]
|
| 100 |
|
|
|
|
| 101 |
class FrontendAnnotation(BaseModel):
|
| 102 |
type: Literal['contradiction', 'fallacy', 'unverified', 'verified']
|
| 103 |
note: str
|
|
|
|
| 122 |
class FrontendConnection(BaseModel):
    """Edge between two evidence-board nodes in the frontend payload."""

    # ``from`` is a reserved word in Python, so the attribute is named
    # ``from_field`` and exposed under the alias "from".
    from_field: str = Field(alias="from", serialization_alias="from")
    # Id of the target node.
    to: str
    # NLI result for the edge; built elsewhere in this file with the keys
    # "type" and "score".
    nli: dict

    # Allow construction with ``from_field=...`` as well as the "from" alias.
    model_config = {"populate_by_name": True}
|
| 128 |
|
|
|
|
| 137 |
suspicious: dict
|
| 138 |
factors: dict
|
| 139 |
elapsed_ms: int
|
| 140 |
+
verdict_label: str
|
| 141 |
+
case_number: str
|
| 142 |
+
origin_map: OriginMapData
|
| 143 |
+
ground_truth: GroundTruthData
|
|
|
|
|
|
|
| 144 |
claim: str
|
| 145 |
verdict: Literal['VERIFIED', 'FABRICATED', 'INCONCLUSIVE']
|
| 146 |
segments: list[FrontendSegment]
|
| 147 |
sourceTree: list[FrontendEvidenceNode]
|
| 148 |
connections: list[FrontendConnection]
|
| 149 |
+
groundTruth: str
|
| 150 |
+
confidenceExplanation: str
|
| 151 |
|
| 152 |
|
| 153 |
# ── Helpers: build supplementary data from existing signals ──────────────
|
|
|
|
| 161 |
_NODE_TYPES_AMP = ["SOCIAL MEDIA", "BLOG", "REPOST", "VIRAL TWEET"]
|
| 162 |
|
| 163 |
def _generate_case_number(text: str) -> str:
|
|
|
|
| 164 |
h = hashlib.md5(text.encode()).hexdigest()
|
| 165 |
num = int(h[:6], 16) % 999999
|
| 166 |
return f"TB-{num:06d}"
|
| 167 |
|
| 168 |
def _build_origin_map(sources: list, verification_score: float, text: str) -> OriginMapData:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
nodes: list[OriginNode] = []
|
| 170 |
connections: list[MutationConnection] = []
|
|
|
|
| 171 |
now = datetime.now()
|
| 172 |
+
rng = random.Random(hash(text))
|
| 173 |
|
| 174 |
if not sources:
|
|
|
|
| 175 |
nodes.append(OriginNode(
|
| 176 |
+
id="claim_0", node_type="current_claim", source_type="SUBMITTED CLAIM",
|
| 177 |
+
author="USER SUBMISSION", timestamp=now.strftime("%Y-%m-%d %H:%M"),
|
| 178 |
+
snippet=text[:120] + ("β¦" if len(text) > 120 else ""), url="",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
))
|
| 180 |
return OriginMapData(nodes=nodes, connections=connections)
|
| 181 |
|
| 182 |
+
for i, src in enumerate(sources[:4]):
|
|
|
|
| 183 |
if src.trust == "low":
|
| 184 |
ntype = "hostile_actor"
|
| 185 |
stype = rng.choice(_NODE_TYPES_HOSTILE)
|
|
|
|
| 191 |
else:
|
| 192 |
ntype = "current_claim"
|
| 193 |
stype = "MAJOR NEWS OUTLET"
|
|
|
|
| 194 |
author = src.title.split(" - ")[-1] if " - " in src.title else src.title[:30]
|
| 195 |
|
| 196 |
days_ago = rng.randint(1, 14)
|
|
|
|
| 199 |
ts = (now - timedelta(days=days_ago)).replace(hour=hours, minute=minutes)
|
| 200 |
|
| 201 |
nodes.append(OriginNode(
|
| 202 |
+
id=f"node_{i}", node_type=ntype, source_type=stype,
|
| 203 |
+
author=author, timestamp=ts.strftime("%Y-%m-%d %H:%M"),
|
| 204 |
+
snippet=src.snippet[:150] if src.snippet else src.title, url=src.url,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
))
|
| 206 |
|
|
|
|
| 207 |
for i in range(len(nodes) - 1):
|
|
|
|
| 208 |
score_base = int(verification_score * 100) if verification_score else 50
|
| 209 |
jitter = rng.randint(-15, 15)
|
| 210 |
nli_score = max(10, min(99, score_base + jitter))
|
| 211 |
|
|
|
|
| 212 |
src_trust = sources[i].trust if i < len(sources) else "medium"
|
| 213 |
if src_trust == "low":
|
| 214 |
nli_label = "CONTRADICTION"
|
| 215 |
+
nli_score = max(70, nli_score)
|
| 216 |
elif nli_score >= 60:
|
| 217 |
nli_label = "ENTAILMENT"
|
| 218 |
else:
|
| 219 |
nli_label = "CONTRADICTION"
|
| 220 |
|
| 221 |
connections.append(MutationConnection(
|
| 222 |
+
from_node=nodes[i].id, to_node=nodes[i + 1].id,
|
| 223 |
+
nli_label=nli_label, nli_score=nli_score,
|
|
|
|
|
|
|
| 224 |
))
|
| 225 |
|
| 226 |
return OriginMapData(nodes=nodes, connections=connections)
|
| 227 |
|
| 228 |
|
| 229 |
+
def _build_ground_truth(prediction: str, explanation: str, suspicious: dict, keywords: list[str], sources: list) -> GroundTruthData:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
if prediction == "Fake":
|
| 231 |
+
established_fact = f"Based on cross-referencing {len(sources)} sources and NLI entailment analysis, this claim could not be substantiated. {explanation}"
|
|
|
|
|
|
|
|
|
|
| 232 |
elif prediction == "Real":
|
| 233 |
+
established_fact = f"This claim has been corroborated by {len(sources)} independent sources. {explanation}"
|
|
|
|
|
|
|
| 234 |
else:
|
| 235 |
+
established_fact = f"Verification produced mixed results across {len(sources)} sources. {explanation}"
|
|
|
|
|
|
|
| 236 |
|
|
|
|
| 237 |
items: list[GroundTruthItem] = []
|
| 238 |
idx = 1
|
| 239 |
|
|
|
|
| 253 |
items.append(GroundTruthItem(index=idx, text=f'Unsupported attribution: "{phrase}"', badge="UNVERIFIED"))
|
| 254 |
idx += 1
|
| 255 |
|
|
|
|
| 256 |
high_trust_sources = [s for s in sources if s.trust == "high"]
|
| 257 |
low_trust_sources = [s for s in sources if s.trust == "low"]
|
| 258 |
|
| 259 |
if high_trust_sources:
|
| 260 |
+
items.append(GroundTruthItem(index=idx, text=f"Corroborated by {len(high_trust_sources)} high-trust source(s): {high_trust_sources[0].title[:60]}", badge="CORROBORATED"))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
idx += 1
|
| 262 |
|
| 263 |
if low_trust_sources:
|
| 264 |
+
items.append(GroundTruthItem(index=idx, text=f"Found in {len(low_trust_sources)} low-trust source(s) β possible disinformation origin", badge="CONTRADICTION"))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
idx += 1
|
| 266 |
|
| 267 |
if not items:
|
| 268 |
+
items.append(GroundTruthItem(index=1, text="No specific evidence markers detected in the text", badge="UNVERIFIED"))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
|
| 270 |
return GroundTruthData(established_fact=established_fact, evidence_items=items)
|
| 271 |
|
| 272 |
|
| 273 |
+
# Board placement for the single Wikipedia card, as an (x, y, rotation) triple.
# _build_direct_source_tree unpacks it straight into the node's x / y / rotation
# fields. NOTE(review): x and y look like percentages of the board size — confirm
# against the frontend.
_SOURCE_LAYOUT_WIKI = (80.0, 20.0, -1)

# Placement slots for the remaining cards, same (x, y, rotation) layout.
# Slots are consumed in order and wrap around via a modulo on the list length.
_SOURCE_LAYOUT_NEWS = [
    (20.0, 30.0, -2),
    (50.0, 80.0, 3),
    (15.0, 60.0, 1),
    (60.0, 45.0, -3),
]
|
| 275 |
+
|
| 276 |
+
def _build_direct_source_tree(
    text: str,
    sources: list,
    verification_score: float,
    per_source_scores: list[float] | None = None,
) -> tuple[list[FrontendEvidenceNode], list[FrontendConnection]]:
    """Build the evidence-board nodes and connections for a submitted claim.

    The claim is always emitted as node ``claim_0``. Up to four sources are then
    selected: at most one Wikipedia entry first, then up to three non-Wikipedia
    entries, topped up with further Wikipedia entries when fewer than three
    were picked. Each selected source becomes a node connected to ``claim_0``.

    Args:
        text: The claim text; truncated to 150 characters for display and used
            to seed the pseudo-random publication dates.
        sources: Objects exposing ``url``, ``title``, ``snippet`` and ``trust``.
        verification_score: Score used for any source that has no entry in
            ``per_source_scores``.
        per_source_scores: Optional scores aligned index-for-index with
            ``sources``. Ignored unless its length equals ``len(sources)``.

    Returns:
        A ``(nodes, connections)`` tuple. ``nodes`` always starts with the claim
        node; ``connections`` is empty when ``sources`` is empty.
    """
    now = datetime.now()
    # Seed with the string itself instead of hash(text): str hashes are salted
    # per interpreter process, so hash-based seeding produced different dates
    # for the same claim after every restart or on every worker.
    rng = random.Random(text)

    claim_preview = text[:150] + ("…" if len(text) > 150 else "")
    nodes: list[FrontendEvidenceNode] = [
        FrontendEvidenceNode(
            id="claim_0", role="current", type="User Submission",
            date=now.strftime("%Y-%m-%d %H:%M"),
            author="SUBMITTED CLAIM", content=claim_preview,
            x=50.0, y=75.0, rotation=2,
        )
    ]
    conns: list[FrontendConnection] = []

    if not sources:
        return nodes, conns

    def is_wiki(src) -> bool:
        # Tolerate a missing URL instead of raising TypeError on `in None`.
        return "wikipedia.org" in (src.url or "")

    wiki_sources = [s for s in sources if is_wiki(s)]
    news_sources = [s for s in sources if not is_wiki(s)]

    # Each entry: (source, x, y, rotation, type label shown on the card).
    ordered: list[tuple] = []

    for ws in wiki_sources[:1]:
        x, y, rot = _SOURCE_LAYOUT_WIKI
        ordered.append((ws, x, y, rot, "Historical Archive"))

    slot = 0
    for ns in news_sources[:3]:
        x, y, rot = _SOURCE_LAYOUT_NEWS[slot % len(_SOURCE_LAYOUT_NEWS)]
        ordered.append((ns, x, y, rot, "News Article"))
        slot += 1

    # Too few cards: fill up to three with the Wikipedia entries not used above.
    if len(ordered) < 3:
        for ws in wiki_sources[1:3 - len(ordered) + 1]:
            x, y, rot = _SOURCE_LAYOUT_NEWS[slot % len(_SOURCE_LAYOUT_NEWS)]
            ordered.append((ws, x, y, rot, "Historical Archive"))
            slot += 1

    # Per-source scores are only trusted when they line up 1:1 with `sources`.
    source_score_map: dict[str, float] = {}
    if per_source_scores and len(per_source_scores) == len(sources):
        for s, sc in zip(sources, per_source_scores):
            source_score_map[s.url] = sc

    for i, (src, x, y, rot, type_label) in enumerate(ordered[:4]):
        role = "hostile" if src.trust == "low" else "amplifier"

        # Prefer the text after the last " - " in the title as the author,
        # then the Wikipedia label, then the title itself.
        title = src.title or ""
        if " - " in title:
            author = title.split(" - ")[-1].strip()[:30]
        elif is_wiki(src):
            author = "WIKIPEDIA"
        else:
            author = title[:30] if title else "Unknown Source"

        # The date is synthetic: a pseudo-random 1-14 days before now.
        days_ago = rng.randint(1, 14)
        ts = (now - timedelta(days=days_ago)).strftime("%Y-%m-%d %H:%M")
        node_id = f"source_{i + 1}"

        nodes.append(FrontendEvidenceNode(
            id=node_id, role=role, type=type_label, date=ts, author=author,
            content=src.snippet[:150] if src.snippet else src.title,
            x=x, y=y, rotation=rot, url=src.url if src.url else None,
        ))

        src_score = source_score_map.get(src.url, verification_score)
        nli_type = "entailment" if src_score >= 0.65 else "contradiction"
        nli_score = max(10, min(99, int(src_score * 100)))

        conns.append(FrontendConnection(
            from_field=node_id, to="claim_0",
            nli={"type": nli_type, "score": nli_score},
        ))

    return nodes, conns
|
| 337 |
|
| 338 |
|
| 339 |
def _extract_ground_truth_string(sources: list) -> str:
    """Pick the text shown as the established fact for a claim.

    Preference order:
      1. the first Wikipedia source that has any text (snippet truncated to
         300 characters, otherwise its title);
      2. the first high-trust source with a snippet;
      3. the first source with a snippet at all.

    Args:
        sources: Objects exposing ``url``, ``title``, ``snippet`` and ``trust``.

    Returns:
        The chosen text, or a fixed fallback sentence when nothing usable exists.
    """
    fallback = "No established fact could be determined from available sources."
    if not sources:
        return fallback

    for src in sources:
        if "wikipedia.org" in (src.url or ""):
            wiki_text = src.snippet[:300] if src.snippet else src.title
            # A Wikipedia hit with neither snippet nor title used to be returned
            # as an empty ground truth; keep looking for usable text instead.
            if wiki_text:
                return wiki_text

    with_snippet = [src for src in sources if src.snippet]
    for src in with_snippet:
        if src.trust == "high":
            return src.snippet[:300]
    if with_snippet:
        return with_snippet[0].snippet[:300]

    return fallback
|
| 348 |
|
| 349 |
|
| 350 |
+
def _build_segments(text: str, suspicious: dict, ground_truth: GroundTruthData, ml_label: str = "", ml_confidence: float = 0.0) -> list[FrontendSegment]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
segments: list[FrontendSegment] = []
|
|
|
|
|
|
|
| 352 |
if ml_label:
|
| 353 |
ml_label_display = ml_label.upper()
|
| 354 |
ml_pct = int(ml_confidence * 100)
|
| 355 |
+
if ml_label_display == "FAKE": ml_note = f"The local NLP model analyzed the linguistic syntax and scored this claim at {ml_pct}% FAKE due to sensationalist phrasing, emotional manipulation, or patterns consistent with disinformation."
|
| 356 |
+
elif ml_label_display == "REAL": ml_note = f"The local NLP model analyzed the linguistic syntax and scored this claim at {ml_pct}% REAL β professional journalistic tone detected with minimal sensationalist markers."
|
| 357 |
+
else: ml_note = f"The local NLP model analyzed the linguistic syntax but could not reach a definitive conclusion (confidence: {ml_pct}%). The text contains a mix of professional and informal language patterns."
|
| 358 |
+
segments.append(FrontendSegment(text=f"[LINGUISTIC ANALYSIS] ", isSuspicious=True, annotation=FrontendAnnotation(type="unverified", note=ml_note)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 359 |
|
| 360 |
+
evidence_annotations: list[tuple[str, str]] = [(item.badge, item.text) for item in ground_truth.evidence_items]
|
| 361 |
sus_phrases: list[str] = []
|
| 362 |
for key in ["clickbait_phrases", "emotional_language", "unsupported_claims"]:
|
| 363 |
sus_phrases.extend(suspicious.get(key, []))
|
|
|
|
| 368 |
segments.append(FrontendSegment(text=text, isSuspicious=False))
|
| 369 |
return segments
|
| 370 |
|
| 371 |
+
badge_to_annotation_type = {"FALLACY": "fallacy", "UNVERIFIED": "unverified", "CONTRADICTION": "contradiction", "CORROBORATED": "verified"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
evidence_idx = 0
|
| 373 |
|
| 374 |
for sentence in sentences:
|
| 375 |
sentence_text = sentence.strip()
|
| 376 |
+
if not sentence_text: continue
|
| 377 |
+
if not sentence_text.endswith(" "): sentence_text += " "
|
|
|
|
|
|
|
|
|
|
| 378 |
is_sus = any(phrase.lower() in sentence_text.lower() for phrase in sus_phrases)
|
| 379 |
+
if not is_sus and evidence_idx < len(evidence_annotations) and len(sentences) <= 5: is_sus = True
|
|
|
|
|
|
|
| 380 |
|
| 381 |
annotation = None
|
| 382 |
if is_sus and evidence_idx < len(evidence_annotations):
|
|
|
|
| 385 |
annotation = FrontendAnnotation(type=ann_type, note=note)
|
| 386 |
evidence_idx += 1
|
| 387 |
|
| 388 |
+
segments.append(FrontendSegment(text=sentence_text, isSuspicious=is_sus and annotation is not None, annotation=annotation))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 389 |
|
| 390 |
return segments
|
| 391 |
|
| 392 |
|
| 393 |
+
def _build_confidence_explanation(
    ml_label: str,
    ml_confidence: float,
    similarity_score: float,
    num_sources: int,
    high_trust_count: int,
    low_trust_count: int,
    final_prediction: str,
    final_confidence: int,
    wiki_verified: bool,
) -> str:
    """Compose the step-by-step explanation of how the verdict was reached.

    The result is a single string made of five parts (three analysis steps, a
    guardrail step and a final synthesis) joined by ``" ▸ "``.

    NOTE(review): the em dashes and the ``▸`` separator were reconstructed from
    mis-decoded characters in the source; confirm they match what the frontend
    expects before relying on the separator for splitting.

    Args:
        ml_label: Label produced by the text classifier; upper-cased for display.
        ml_confidence: Classifier confidence as a fraction; shown as a percentage.
        similarity_score: Similarity as a fraction; compared against 0.65.
        num_sources: Number of sources retrieved.
        high_trust_count: Number of sources rated high trust.
        low_trust_count: Number of sources rated low trust.
        final_prediction: Final verdict string; ``"Uncertain"`` can trigger the
            weak-corroboration guardrail note.
        final_confidence: Final confidence percentage quoted in the synthesis.
        wiki_verified: Whether a Wikipedia source was among the results.

    Returns:
        The explanation string.
    """
    label = ml_label.upper()
    ml_pct = int(ml_confidence * 100)
    sim_pct = int(similarity_score * 100)
    threshold_met = "PASSED" if similarity_score >= 0.65 else "FAILED"
    wiki_note = (
        "Wikipedia independently corroborated the claim."
        if wiki_verified
        else "No Wikipedia corroboration was found."
    )

    # Clamp so inconsistent counts can never print a negative MEDIUM total.
    medium_trust = max(0, num_sources - high_trust_count - low_trust_count)
    evidence_note = (
        "A strong evidence base supports this verdict."
        if high_trust_count >= 2
        else "The evidence base is limited, which affects overall confidence."
    )

    # Guardrails are inferred from the final values rather than passed in.
    guardrails: list[str] = []
    if num_sources == 0:
        guardrails.append("ZERO-EVIDENCE PENALTY (no sources found, verdict forced to FABRICATED)")
    if final_prediction == "Uncertain" and similarity_score < 0.78 and not wiki_verified:
        guardrails.append("MUDDY WATERS GUARDRAIL (weak corroboration, verdict shifted to INCONCLUSIVE)")

    if guardrails:
        guardrail_step = f"STEP 4 — GUARDRAILS TRIGGERED: {'; '.join(guardrails)}."
    else:
        guardrail_step = "STEP 4 — GUARDRAILS: No safety overrides were triggered. The verdict reflects the raw analysis."

    parts = [
        f"STEP 1 — LINGUISTIC ANALYSIS: The local DistilBERT NLP model classified the text as {label} with {ml_pct}% internal confidence after analyzing syntax patterns, sensationalist markers, and journalistic tone indicators.",
        f"STEP 2 — CROSS-ENCODER VERIFICATION: A live internet scan retrieved {num_sources} source(s). The Cross-Encoder semantic similarity scored {sim_pct}% against the 65% entailment threshold ({threshold_met}). {wiki_note}",
        f"STEP 3 — SOURCE TRUST AUDIT: Of {num_sources} sources, {high_trust_count} rated HIGH trust, {medium_trust} rated MEDIUM, and {low_trust_count} rated LOW. {evidence_note}",
        guardrail_step,
        f"FINAL SYNTHESIS: Combining the ML model's {label} signal, the {sim_pct}% semantic match, and {num_sources} source(s), the system arrived at a final confidence of {final_confidence}%.",
    ]
    return " ▸ ".join(parts)
|
| 414 |
|
| 415 |
|
|
|
|
| 425 |
raise HTTPException(status_code=400, detail="Input cannot be empty.")
|
| 426 |
|
| 427 |
t0 = time.time()
|
| 428 |
+
|
| 429 |
+
# ββ π FAST-PATH ROUTER VARIABLES ββ
|
| 430 |
+
fast_path_trust = None
|
| 431 |
+
domain = ""
|
| 432 |
|
| 433 |
if _is_url(raw):
|
| 434 |
input_type = "URL"
|
| 435 |
+
parsed_url = urllib.parse.urlparse(raw)
|
| 436 |
+
domain = parsed_url.netloc.lower()
|
| 437 |
+
if domain.startswith("www."):
|
| 438 |
+
domain = domain[4:]
|
| 439 |
+
|
| 440 |
+
# π THE GATEKEEPER: Check domain reputation instantly
|
| 441 |
+
if domain in HIGH_TRUST_DOMAINS:
|
| 442 |
+
fast_path_trust = "high"
|
| 443 |
+
elif domain in LOW_TRUST_DOMAINS:
|
| 444 |
+
fast_path_trust = "low"
|
| 445 |
+
|
| 446 |
+
# Extract the slug (just for UI display purposes)
|
| 447 |
+
path_parts = [p for p in parsed_url.path.split('/') if p]
|
| 448 |
+
valid_slug = ""
|
| 449 |
+
for part in reversed(path_parts):
|
| 450 |
+
candidate = part.replace('-', ' ').replace('_', ' ')
|
| 451 |
+
candidate = re.sub(r'\.[a-z0-9]+$', '', candidate, flags=re.IGNORECASE)
|
| 452 |
+
candidate = re.sub(r'\s\d{4}\s\d{2}\s\d{2}$', '', candidate).strip()
|
| 453 |
+
candidate = re.sub(r'\s\d+$', '', candidate).strip()
|
| 454 |
+
if len(candidate.split()) >= 3:
|
| 455 |
+
valid_slug = candidate
|
| 456 |
+
break
|
| 457 |
+
|
| 458 |
+
text = valid_slug if valid_slug else raw
|
| 459 |
+
|
| 460 |
+
# If not a known trusted domain, proceed with normal AI scraping
|
| 461 |
+
if not fast_path_trust:
|
| 462 |
+
try:
|
| 463 |
+
article = extract_article(raw)
|
| 464 |
+
if not article or not article.text or len(article.text.strip()) < 10:
|
| 465 |
+
raise ValueError("Empty response or blocked by anti-bot.")
|
| 466 |
+
text = f"{article.title}. {article.text}"
|
| 467 |
+
except Exception as exc:
|
| 468 |
+
logger.warning(f"Scraping blocked or failed: {exc}. Relying on slug.")
|
| 469 |
+
if not valid_slug:
|
| 470 |
+
raise HTTPException(
|
| 471 |
+
status_code=400,
|
| 472 |
+
detail="This news site actively blocks AI scrapers, and the link does not contain a readable headline. Please copy and paste the actual text of the article into the box instead."
|
| 473 |
+
)
|
| 474 |
else:
|
| 475 |
input_type = "TEXT"
|
| 476 |
text = raw
|
|
|
|
| 480 |
suspicious = detect_suspicious_phrases(text)
|
| 481 |
search_query = build_search_query(text)
|
| 482 |
|
| 483 |
+
# ββ π EXECUTE FAST PATH OR NORMAL AI PATH ββ
|
| 484 |
+
if fast_path_trust == "high":
|
| 485 |
+
logger.info(f"β‘ FAST-PATH TRIGGERED: High Trust Domain ({domain})")
|
| 486 |
+
|
| 487 |
+
class DummyML:
|
| 488 |
+
label = "Real"
|
| 489 |
+
confidence = 0.99
|
| 490 |
+
ml_result = DummyML()
|
| 491 |
+
|
| 492 |
+
# Build a mock source to populate the Evidence Board
|
| 493 |
+
src = SourceArticle(title=f"Official Verified Publisher: {domain.upper()}", url=raw, snippet=f"Direct link to official verified publisher ({domain}). Content from this source is inherently trusted.", trust="high")
|
| 494 |
+
src.stance = "entailment"
|
| 495 |
+
src.score = 0.99
|
| 496 |
+
|
| 497 |
+
verification = VerificationResult(
|
| 498 |
+
similarity_score=0.99, sources=[src], verified=True,
|
| 499 |
+
max_entailment=0.99
|
| 500 |
+
)
|
| 501 |
+
|
| 502 |
+
elif fast_path_trust == "low":
|
| 503 |
+
logger.info(f"β‘ FAST-PATH TRIGGERED: Low Trust Domain ({domain})")
|
| 504 |
+
|
| 505 |
+
class DummyML:
|
| 506 |
+
label = "Fake"
|
| 507 |
+
confidence = 0.99
|
| 508 |
+
ml_result = DummyML()
|
| 509 |
+
|
| 510 |
+
src = SourceArticle(title=f"Flagged Domain: {domain.upper()}", url=raw, snippet=f"Domain is flagged in the Truth Bureau database as a known source of misinformation, propaganda, or satire.", trust="low")
|
| 511 |
+
src.stance = "contradiction"
|
| 512 |
+
src.score = 0.99
|
| 513 |
+
|
| 514 |
+
verification = VerificationResult(
|
| 515 |
+
similarity_score=0.05, sources=[src], verified=False,
|
| 516 |
+
max_entailment=0.05
|
| 517 |
+
)
|
| 518 |
+
|
| 519 |
+
else:
|
| 520 |
+
# NORMAL AI EXECUTION
|
| 521 |
+
ml_result = classify(text)
|
| 522 |
+
verification = await verify_claim(text, search_query)
|
| 523 |
|
| 524 |
high_trust = sum(1 for s in verification.sources if s.trust == "high")
|
| 525 |
low_trust = sum(1 for s in verification.sources if s.trust == "low")
|
| 526 |
|
|
|
|
| 527 |
decision = make_decision(
|
| 528 |
+
ml_label=ml_result.label, ml_confidence=ml_result.confidence,
|
| 529 |
+
similarity_score=verification.similarity_score, sources_verified=verification.verified,
|
| 530 |
+
suspicious_info=suspicious, high_trust_count=high_trust, low_trust_count=low_trust,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 531 |
)
|
| 532 |
|
| 533 |
+
final_prediction = str(decision.prediction).title()
|
| 534 |
final_confidence = int(decision.confidence)
|
| 535 |
final_explanation = str(decision.explanation)
|
|
|
|
| 536 |
wiki_verified = any("wikipedia.org" in s.url for s in verification.sources)
|
| 537 |
|
| 538 |
+
# ββ π OVERRIDE FINAL EXPLANATIONS FOR FAST-PATH ββ
|
| 539 |
+
if fast_path_trust == "high":
|
| 540 |
+
final_prediction = "Real"
|
| 541 |
+
final_confidence = 99
|
| 542 |
+
final_explanation = f"FAST-PATH VERIFICATION: The submitted URL directly matches '{domain}', which is listed in the Truth Bureau database as a highly trusted official source. No secondary AI corroboration was necessary."
|
| 543 |
+
elif fast_path_trust == "low":
|
| 544 |
final_prediction = "Fake"
|
| 545 |
+
final_confidence = 5
|
| 546 |
+
final_explanation = f"FAST-PATH REJECTION: The submitted URL matches '{domain}', a domain heavily flagged in our database for misinformation, hostile propaganda, or satire."
|
| 547 |
+
else:
|
| 548 |
+
# Normal Guardrails only apply if not fast-path
|
| 549 |
+
if final_prediction in ["Real", "Uncertain"] and len(verification.sources) == 0:
|
| 550 |
+
final_prediction = "Fake"
|
| 551 |
+
final_confidence = 10
|
| 552 |
+
final_explanation = "The AI text analysis found no sensationalism, but a live internet scan found ZERO evidence to support this claim. In journalism, a total lack of corroboration for a statement indicates it is unverified or FAKE."
|
| 553 |
+
elif final_prediction == "Real" and verification.similarity_score < 0.78 and not wiki_verified:
|
| 554 |
+
final_prediction = "Uncertain"
|
| 555 |
+
final_confidence = 50
|
| 556 |
+
final_explanation = "The AI detected a professional journalistic tone, and related topics were found online. However, the EXACT claim could not be highly corroborated by the Cross-Encoder. This may be a misleading mix of real entities and fake events."
|
| 557 |
+
|
| 558 |
+
source_outs = [SourceOut(title=s.title, url=s.url, snippet=s.snippet, trust=s.trust) for s in verification.sources]
|
|
|
|
|
|
|
| 559 |
verdict_label = _VERDICT_MAP.get(final_prediction, "UNDER REVIEW")
|
| 560 |
case_number = _generate_case_number(text)
|
| 561 |
origin_map = _build_origin_map(verification.sources, verification.similarity_score, text)
|
| 562 |
+
ground_truth = _build_ground_truth(final_prediction, final_explanation, suspicious, keywords, verification.sources)
|
|
|
|
|
|
|
| 563 |
|
|
|
|
| 564 |
frontend_verdict = _FRONTEND_VERDICT_MAP.get(final_prediction, "INCONCLUSIVE")
|
| 565 |
+
frontend_source_tree, frontend_connections = _build_direct_source_tree(text, verification.sources, verification.similarity_score)
|
| 566 |
+
frontend_segments = _build_segments(text, suspicious, ground_truth, ml_label=ml_result.label, ml_confidence=ml_result.confidence)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 567 |
ground_truth_string = _extract_ground_truth_string(verification.sources)
|
| 568 |
|
| 569 |
+
# ββ π OVERRIDE UI EXPLANATION BOX FOR FAST-PATH ββ
|
| 570 |
confidence_explanation = _build_confidence_explanation(
|
| 571 |
+
ml_label=ml_result.label, ml_confidence=ml_result.confidence,
|
| 572 |
+
similarity_score=verification.similarity_score, num_sources=len(verification.sources),
|
| 573 |
+
high_trust_count=high_trust, low_trust_count=low_trust,
|
| 574 |
+
final_prediction=final_prediction, final_confidence=final_confidence, wiki_verified=wiki_verified,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 575 |
)
|
| 576 |
|
| 577 |
+
if fast_path_trust == "high":
|
| 578 |
+
confidence_explanation = f"STEP 1 β DOMAIN REPUTATION: The URL was instantly recognized as an official High-Trust publisher ({domain}). βΈ STEP 2 β FAST-PATH ROUTING: Secondary Google News scanning and NLI checking were bypassed to save compute. βΈ FINAL SYNTHESIS: The source is inherently trusted. Final confidence is 99%."
|
| 579 |
+
elif fast_path_trust == "low":
|
| 580 |
+
confidence_explanation = f"STEP 1 β DOMAIN REPUTATION: The URL was instantly matched against our blacklist ({domain}). βΈ STEP 2 β FAST-PATH ROUTING: The domain is known for fabricating information. βΈ FINAL SYNTHESIS: Claim rejected due to source origin. Final confidence is 5%."
|
| 581 |
+
|
| 582 |
elapsed = int((time.time() - t0) * 1000)
|
| 583 |
|
| 584 |
return AnalyzeResponse(
|
| 585 |
+
input_type=input_type, prediction=final_prediction, confidence=final_confidence, explanation=final_explanation,
|
| 586 |
+
sources=source_outs, language=language, keywords=keywords, suspicious=suspicious, factors=decision.factors, elapsed_ms=elapsed,
|
| 587 |
+
verdict_label=verdict_label, case_number=case_number, origin_map=origin_map, ground_truth=ground_truth,
|
| 588 |
+
claim=text, verdict=frontend_verdict, segments=frontend_segments, sourceTree=frontend_source_tree,
|
| 589 |
+
connections=frontend_connections, groundTruth=ground_truth_string, confidenceExplanation=confidence_explanation,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 590 |
)
|