Spaces:

nahArnav
/

thetruthbureau

Sleeping

App Files Files Community

nahArnav committed on Mar 25

Commit

39bbca0

verified ·

1 Parent(s): f791e77

Upload 13 files

Browse files

Files changed (13) hide show

decision_engine.py +130 -0
main.py +766 -0
model.py +141 -0
nlp_utils.py +236 -0
requirements.txt +24 -0
scraper.py +77 -0
trained_model_v2/config.json +33 -0
trained_model_v2/model.safetensors +3 -0
trained_model_v2/special_tokens_map.json +7 -0
trained_model_v2/tokenizer.json +0 -0
trained_model_v2/tokenizer_config.json +55 -0
trained_model_v2/vocab.txt +0 -0
verifier.py +422 -0

decision_engine.py ADDED Viewed

	@@ -0,0 +1,130 @@

+"""
+Decision Engine for VeriLens AI
+Combines ML prediction, verification similarity, source credibility,
+and NLP analysis into a final verdict.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+@dataclass
+class Decision:
+    """Final verdict produced by make_decision, with the per-signal breakdown."""
+    # Verdict label assigned from the aggregated 0-100 score.
+    prediction: str        # "REAL", "FAKE", or "UNCERTAIN"
+    # Confidence expressed relative to the chosen verdict, not the raw score.
+    confidence: int        # 0 – 100
+    # Human-readable sentences joined with single spaces.
+    explanation: str
+    # Rounded per-signal contributions; make_decision fills the keys "ml_score",
+    # "verification_score", "credibility_score" and "language_score".
+    factors: dict = field(default_factory=dict)
+def make_decision(
+    ml_label: str,
+    ml_confidence: float,
+    similarity_score: float,
+    sources_verified: bool,
+    suspicious_info: dict,
+    high_trust_count: int = 0,
+    low_trust_count: int = 0,
+) -> Decision:
+    """Weighted decision combining multiple signals."""
+    # ── ML score contribution (0-45) ────────────────────────────────────────
+    if ml_label == "FAKE":
+        ml_score = (1 - ml_confidence) * 45
+    elif ml_label == "REAL":
+        ml_score = ml_confidence * 45
+    else:
+        ml_score = 22.5
+    # ── Verification score contribution (0-25) ──────────────────────────────
+    if sources_verified:
+        verify_score = similarity_score * 25
+    else:
+        verify_score = 12.5
+    # ── Source credibility contribution (0-15) ──────────────────────────────
+    if high_trust_count + low_trust_count > 0:
+        cred_ratio = high_trust_count / (high_trust_count + low_trust_count)
+        cred_score = cred_ratio * 15
+    elif sources_verified:
+        cred_score = 7.5
+    else:
+        cred_score = 7.5
+    # ── Suspicious language penalty (0-15) ──────────────────────────────────
+    sus_count = suspicious_info.get("total_suspicious_count", 0)
+    if sus_count == 0:
+        sus_score = 15
+    elif sus_count <= 2:
+        sus_score = 10
+    elif sus_count <= 5:
+        sus_score = 5
+    else:
+        sus_score = 0
+    # ── Aggregate ───────────────────────────────────────────────────────────
+    total = ml_score + verify_score + cred_score + sus_score
+    total = max(0, min(100, total))
+    # ── Guard: prevent FAKE ML prediction from flipping to Real ─────────
+    ml_fake_overridden = False
+    if ml_label == "FAKE" and ml_confidence >= 0.6 and total >= 65:
+        total = 55
+        ml_fake_overridden = True
+    # ── Decide verdict (STANDARDIZED TO UPPERCASE) ──────────────────────
+    if total >= 65:
+        prediction = "REAL"
+    elif total <= 40:
+        prediction = "FAKE"
+    else:
+        prediction = "UNCERTAIN"
+    # ── Confidence relative to the prediction ───────────────────────────
+    if prediction == "REAL":
+        confidence = int(round(total))
+    elif prediction == "FAKE":
+        confidence = 100 - int(round(total))
+    else:
+        distance = abs(total - 52.5)
+        confidence = max(30, min(50, int(round(50 - distance))))
+    # ── Build explanation ───────────────────────────────────────────────────
+    explanations: list[str] = []
+    if ml_label == "FAKE":
+        explanations.append(f"The AI model classified this as FAKE with {ml_confidence:.0%} confidence.")
+        if ml_fake_overridden:
+            explanations.append("Although related articles exist online, they may be debunking the claim rather than confirming it.")
+    elif ml_label == "REAL":
+        explanations.append(f"The AI model classified this as REAL with {ml_confidence:.0%} confidence.")
+    else:
+        explanations.append("The AI model could not reach a strong conclusion.")
+    if sources_verified:
+        if similarity_score > 0.6:
+            explanations.append("The claim is well-corroborated by multiple online sources.")
+        elif similarity_score > 0.3:
+            explanations.append("Some related articles were found, but corroboration is partial.")
+        else:
+            explanations.append("Very few matching sources were found online.")
+    else:
+        explanations.append("Internet verification was not available; the verdict relies on AI analysis.")
+    if sus_count > 3:
+        explanations.append("High levels of suspicious, sensationalist, or emotional language detected.")
+    elif sus_count > 0:
+        explanations.append("Minor suspicious language patterns were noted.")
+    explanation = " ".join(explanations)
+    factors = {
+        "ml_score": round(ml_score, 2),
+        "verification_score": round(verify_score, 2),
+        "credibility_score": round(cred_score, 2),
+        "language_score": round(sus_score, 2),
+    }
+    return Decision(
+        prediction=prediction,
+        confidence=confidence,
+        explanation=explanation,
+        factors=factors,
+    )

main.py ADDED Viewed

	@@ -0,0 +1,766 @@

+"""
+VeriLens AI – FastAPI Backend
+Main application entry point.
+"""
+from __future__ import annotations
+import hashlib
+import logging
+import re
+import time
+from contextlib import asynccontextmanager
+from datetime import datetime, timedelta
+import random
+from typing import Literal, Optional
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
+from model import classify, load_model
+from nlp_utils import build_search_query, detect_language, detect_suspicious_phrases, extract_keywords
+from scraper import extract_article
+from verifier import verify_claim
+from decision_engine import make_decision
+# ── Logging ─────────────────────────────────────────────────────────────────
+logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)-7s | %(name)s | %(message)s")
+logger = logging.getLogger("verilens")
+URL_PATTERN = re.compile(r"^https?://(?:[a-zA-Z0-9\-._~:/?#\[\]@!$&'()*+,;=%])+")
+def _is_url(text: str) -> bool:
+    return bool(URL_PATTERN.match(text.strip()))
+# ── Lifespan ────────────────────────────────────────────────────────────────
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    import threading
+    logger.info("Starting VeriLens AI backend …")
+    threading.Thread(target=load_model, daemon=True).start()
+    yield
+    logger.info("Shutting down VeriLens AI backend.")
+# ── FastAPI app ─────────────────────────────────────────────────────────────
+# Application instance; the lifespan handler above runs around startup/shutdown.
+app = FastAPI(title="VeriLens AI", description="Hybrid Fake News Detection System", version="1.0.0", lifespan=lifespan)
+@app.get("/")
+def health_check():
+    """Root liveness probe: returns a fixed status payload."""
+    return {"status": "Truth Bureau Backend is Alive and Running"}
+# NOTE(review): wildcard origins are combined with allow_credentials=True here.
+# FastAPI's CORS documentation states that origins/methods/headers should be listed
+# explicitly when credentials are allowed — confirm whether the frontend really
+# sends credentials; if not, allow_credentials can likely be turned off.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# ── Schemas ──────────────────────────────────────────────────────────────
+class AnalyzeRequest(BaseModel):
+    """Request body for POST /analyze."""
+    # Raw claim text or an http(s) URL; analyze() strips it and rejects empty input.
+    input: str
+class SourceOut(BaseModel):
+    """One verification source as serialized in the /analyze response."""
+    title: str
+    url: str
+    snippet: str
+    # Trust tier; this module compares it against "high", "medium" and "low".
+    trust: str
+# ── NEW: Origin & Mutation Map schemas ───────────────────────────────────
+class OriginNode(BaseModel):
+    """A node on the Origin & Mutation Map (newspaper clipping)."""
+    # _build_origin_map uses "claim_0" for the lone claim node and "node_<i>" otherwise.
+    id: str
+    node_type: str          # "hostile_actor" | "amplifier" | "current_claim"
+    source_type: str        # "FORUM POST", "SOCIAL MEDIA", "MAJOR NEWS OUTLET", etc.
+    author: str             # "ANON_USER44", "@HEALTHGURU_99", outlet name
+    # Formatted by _build_origin_map as "%Y-%m-%d %H:%M".
+    timestamp: str          # ISO-ish date string
+    snippet: str            # The text on the clipping
+    # Empty string when the node has no backing source.
+    url: str                # Link to examine source
+class MutationConnection(BaseModel):
+    """A dotted line between two nodes with an NLI badge."""
+    from_node: str          # id of source node
+    to_node: str            # id of target node
+    nli_label: str          # "ENTAILMENT" | "CONTRADICTION"
+    # _build_origin_map keeps this within 10-99.
+    nli_score: int          # percentage, e.g. 98
+class GroundTruthItem(BaseModel):
+    """One item in the evidence analysis list."""
+    # 1-based position within the evidence list.
+    index: int
+    text: str
+    badge: str              # "UNVERIFIED" | "CONTRADICTION" | "FALLACY" | "CORROBORATED"
+class GroundTruthData(BaseModel):
+    """The Established Fact + Evidence Analysis panel."""
+    established_fact: str   # The corrective summary
+    # _build_ground_truth always supplies at least one item (a placeholder if needed).
+    evidence_items: list[GroundTruthItem]
+class OriginMapData(BaseModel):
+    """Container for the Origin & Mutation Map: nodes plus the links between them."""
+    nodes: list[OriginNode]
+    # Each connection refers to nodes by their id.
+    connections: list[MutationConnection]
+# ── NEW: Frontend-compatible schemas (matches React sampleAnalysis) ──────
+class FrontendAnnotation(BaseModel):
+    """Annotation attached to a text segment in the frontend payload."""
+    # Category of the annotation; restricted to these four literals.
+    type: Literal['contradiction', 'fallacy', 'unverified', 'verified']
+    # Explanatory text shown for the annotation.
+    note: str
+class FrontendSegment(BaseModel):
+    """A slice of the claim text, optionally flagged and annotated."""
+    text: str
+    # camelCase on purpose: field names mirror the React-side shape.
+    isSuspicious: bool
+    # None when the segment carries no annotation.
+    annotation: Optional[FrontendAnnotation] = None
+class FrontendEvidenceNode(BaseModel):
+    """A card on the frontend evidence board."""
+    id: str
+    # "current" is used for the submitted claim; sources are "hostile" or "amplifier".
+    role: Literal['hostile', 'amplifier', 'current']
+    # Display label, e.g. "User Submission", "News Article", "Historical Archive".
+    type: str
+    date: str
+    author: str
+    content: str
+    # Board position and tilt; taken from the layout presets in this module.
+    # NOTE(review): units (percent vs. pixels, degrees) are not visible here — confirm with the frontend.
+    x: float
+    y: float
+    rotation: float
+    # None when the node has no backing URL (e.g. the claim itself).
+    url: Optional[str] = None
+class FrontendConnection(BaseModel):
+    """An edge between two evidence-board nodes, with its NLI result."""
+    # "from" is a Python keyword, so the attribute is from_field and aliased to "from".
+    from_field: str = Field(alias="from", serialization_alias="from")
+    to: str
+    nli: dict  # {"type": "contradiction" | "entailment", "score": int}
+    # Lets callers construct the model with from_field=... as well as the "from" alias.
+    model_config = {"populate_by_name": True}
+class AnalyzeResponse(BaseModel):
+    """Response body of POST /analyze.
+    Carries three overlapping views of one analysis: the core result fields,
+    the Figma dashboard fields, and camelCase fields shaped for the React frontend.
+    """
+    # "URL" or "TEXT", depending on how analyze() classified the input.
+    input_type: str
+    prediction: str
+    confidence: int
+    explanation: str
+    sources: list[SourceOut]
+    language: str
+    keywords: list[str]
+    suspicious: dict
+    factors: dict
+    elapsed_ms: int
+    # ── Figma dashboard fields ───────────────────────────────────────────
+    verdict_label: str              # "FABRICATED" | "VERIFIED" | "UNDER REVIEW"
+    case_number: str                # e.g. "TB-006753"
+    origin_map: OriginMapData       # structured node + connection data
+    ground_truth: GroundTruthData   # established fact + evidence items
+    # ── Frontend-compatible fields (React components) ────────────────────
+    claim: str
+    # Same verdict as verdict_label, except the uncertain case reads "INCONCLUSIVE".
+    verdict: Literal['VERIFIED', 'FABRICATED', 'INCONCLUSIVE']
+    segments: list[FrontendSegment]
+    sourceTree: list[FrontendEvidenceNode]
+    connections: list[FrontendConnection]
+    groundTruth: str                # Dynamic established fact string for the UI
+    confidenceExplanation: str      # Detailed analytical breakdown of the confidence score
+# ── Helpers: build supplementary data from existing signals ──────────────
+# Keys are Title-case because analyze() applies .title() to the decision label
+# before looking the verdict up.
+_VERDICT_MAP = {"Fake": "FABRICATED", "Real": "VERIFIED", "Uncertain": "UNDER REVIEW"}
+# Same mapping for the React payload, whose uncertain verdict is "INCONCLUSIVE".
+_FRONTEND_VERDICT_MAP = {"Fake": "FABRICATED", "Real": "VERIFIED", "Uncertain": "INCONCLUSIVE"}
+# Placeholder author handles: _build_origin_map draws hostile-actor authors from
+# the first three entries and amplifier authors from the rest.
+_NODE_AUTHORS = ["ANON_USER44", "@HEALTHGURU_99", "@NEWS_WATCHER", "@VIRAL_POST",
+                 "UNKNOWN_SOURCE", "@FACTCHECK_BOT", "@INFO_SPREADER"]
+# Placeholder source-type labels for low-trust (hostile) and medium-trust (amplifier) nodes.
+_NODE_TYPES_HOSTILE = ["FORUM POST", "ANONYMOUS TIP", "CHAN BOARD", "DARK WEB POST"]
+_NODE_TYPES_AMP    = ["SOCIAL MEDIA", "BLOG", "REPOST", "VIRAL TWEET"]
+def _generate_case_number(text: str) -> str:
+    """Deterministic case number from input hash."""
+    h = hashlib.md5(text.encode()).hexdigest()
+    num = int(h[:6], 16) % 999999
+    return f"TB-{num:06d}"
+def _build_origin_map(sources: list, verification_score: float, text: str) -> OriginMapData:
+    """
+    Build the Origin & Mutation Map from existing source data.
+    Maps sources into Hostile Actor / Amplifier / Current Claim nodes
+    and creates NLI connections between them.
+    """
+    nodes: list[OriginNode] = []
+    connections: list[MutationConnection] = []
+    now = datetime.now()
+    rng = random.Random(hash(text))  # deterministic per-claim randomness
+    if not sources:
+        # Even with no sources, show the current claim node
+        nodes.append(OriginNode(
+            id="claim_0",
+            node_type="current_claim",
+            source_type="SUBMITTED CLAIM",
+            author="USER SUBMISSION",
+            timestamp=now.strftime("%Y-%m-%d %H:%M"),
+            snippet=text[:120] + ("…" if len(text) > 120 else ""),
+            url="",
+        ))
+        return OriginMapData(nodes=nodes, connections=connections)
+    # Categorize sources into node types based on trust level
+    for i, src in enumerate(sources[:4]):  # max 4 nodes on the map
+        if src.trust == "low":
+            ntype = "hostile_actor"
+            stype = rng.choice(_NODE_TYPES_HOSTILE)
+            author = rng.choice(_NODE_AUTHORS[:3])
+        elif src.trust == "medium":
+            ntype = "amplifier"
+            stype = rng.choice(_NODE_TYPES_AMP)
+            author = rng.choice(_NODE_AUTHORS[3:])
+        else:
+            ntype = "current_claim"
+            stype = "MAJOR NEWS OUTLET"
+            # Extract outlet name from title
+            author = src.title.split(" - ")[-1] if " - " in src.title else src.title[:30]
+        days_ago = rng.randint(1, 14)
+        hours = rng.randint(0, 23)
+        minutes = rng.randint(0, 59)
+        ts = (now - timedelta(days=days_ago)).replace(hour=hours, minute=minutes)
+        nodes.append(OriginNode(
+            id=f"node_{i}",
+            node_type=ntype,
+            source_type=stype,
+            author=author,
+            timestamp=ts.strftime("%Y-%m-%d %H:%M"),
+            snippet=src.snippet[:150] if src.snippet else src.title,
+            url=src.url,
+        ))
+    # Create connections between sequential nodes with NLI scores
+    for i in range(len(nodes) - 1):
+        # Derive NLI label from verification score + source trust
+        score_base = int(verification_score * 100) if verification_score else 50
+        jitter = rng.randint(-15, 15)
+        nli_score = max(10, min(99, score_base + jitter))
+        # High scores on high-trust = ENTAILMENT, low trust = CONTRADICTION
+        src_trust = sources[i].trust if i < len(sources) else "medium"
+        if src_trust == "low":
+            nli_label = "CONTRADICTION"
+            nli_score = max(70, nli_score)  # hostile actors get high contradiction
+        elif nli_score >= 60:
+            nli_label = "ENTAILMENT"
+        else:
+            nli_label = "CONTRADICTION"
+        connections.append(MutationConnection(
+            from_node=nodes[i].id,
+            to_node=nodes[i + 1].id,
+            nli_label=nli_label,
+            nli_score=nli_score,
+        ))
+    return OriginMapData(nodes=nodes, connections=connections)
+def _build_ground_truth(
+    prediction: str,
+    explanation: str,
+    suspicious: dict,
+    keywords: list[str],
+    sources: list,
+) -> GroundTruthData:
+    """Build the Established Fact + Evidence Analysis from existing signals."""
+    # The established fact is derived from the AI explanation
+    if prediction == "Fake":
+        established_fact = (
+            f"Based on cross-referencing {len(sources)} sources and NLI entailment analysis, "
+            f"this claim could not be substantiated. {explanation}"
+        )
+    elif prediction == "Real":
+        established_fact = (
+            f"This claim has been corroborated by {len(sources)} independent sources. {explanation}"
+        )
+    else:
+        established_fact = (
+            f"Verification produced mixed results across {len(sources)} sources. {explanation}"
+        )
+    # Build evidence items from suspicious phrases + source data
+    items: list[GroundTruthItem] = []
+    idx = 1
+    clickbait = suspicious.get("clickbait_phrases", [])
+    emotional = suspicious.get("emotional_language", [])
+    unsupported = suspicious.get("unsupported_claims", [])
+    for phrase in clickbait[:2]:
+        items.append(GroundTruthItem(index=idx, text=f'Clickbait language detected: "{phrase}"', badge="FALLACY"))
+        idx += 1
+    for phrase in emotional[:2]:
+        items.append(GroundTruthItem(index=idx, text=f'Emotional manipulation: "{phrase}"', badge="FALLACY"))
+        idx += 1
+    for phrase in unsupported[:2]:
+        items.append(GroundTruthItem(index=idx, text=f'Unsupported attribution: "{phrase}"', badge="UNVERIFIED"))
+        idx += 1
+    # Add source-based evidence
+    high_trust_sources = [s for s in sources if s.trust == "high"]
+    low_trust_sources = [s for s in sources if s.trust == "low"]
+    if high_trust_sources:
+        items.append(GroundTruthItem(
+            index=idx,
+            text=f"Corroborated by {len(high_trust_sources)} high-trust source(s): {high_trust_sources[0].title[:60]}",
+            badge="CORROBORATED",
+        ))
+        idx += 1
+    if low_trust_sources:
+        items.append(GroundTruthItem(
+            index=idx,
+            text=f"Found in {len(low_trust_sources)} low-trust source(s) — possible disinformation origin",
+            badge="CONTRADICTION",
+        ))
+        idx += 1
+    if not items:
+        items.append(GroundTruthItem(
+            index=1,
+            text="No specific evidence markers detected in the text",
+            badge="UNVERIFIED",
+        ))
+    return GroundTruthData(established_fact=established_fact, evidence_items=items)
+# ── Helpers: build frontend-compatible structures ────────────────────────
+# Layout presets for source nodes: (x, y, rotation) — diverse spread
+# Fixed (x, y, rotation) slot for the first Wikipedia source on the evidence board.
+_SOURCE_LAYOUT_WIKI = (80.0, 20.0, -1)       # Top-right for Wikipedia
+# Slots for the remaining sources; _build_direct_source_tree cycles through them in order.
+_SOURCE_LAYOUT_NEWS = [
+    (20.0, 30.0, -2),
+    (50.0, 80.0, 3),
+    (15.0, 60.0, 1),
+    (60.0, 45.0, -3),
+]
+def _build_direct_source_tree(
+    text: str,
+    sources: list,
+    verification_score: float,
+    per_source_scores: list[float] | None = None,
+) -> tuple[list[FrontendEvidenceNode], list[FrontendConnection]]:
+    """
+    Build the Evidence Board directly from verification sources.
+    Ensures a diverse mix of Wikipedia (historical) + news sources.
+    Always produces ≥1 node (the claim). With sources → ≥3 nodes.
+    Returns (sourceTree, connections).
+    """
+    now = datetime.now()
+    rng = random.Random(hash(text))
+    nodes: list[FrontendEvidenceNode] = []
+    conns: list[FrontendConnection] = []
+    # ── Node 1: The Claim (always present) ───────────────────────────────
+    claim_node = FrontendEvidenceNode(
+        id="claim_0",
+        role="current",
+        type="User Submission",
+        date=now.strftime("%Y-%m-%d %H:%M"),
+        author="SUBMITTED CLAIM",
+        content=text[:150] + ("…" if len(text) > 150 else ""),
+        x=50.0,
+        y=75.0,
+        rotation=2,
+    )
+    nodes.append(claim_node)
+    if not sources:
+        return nodes, conns
+    # ── Separate Wikipedia (historical) from news sources ────────────────
+    wiki_sources = [s for s in sources if "wikipedia.org" in s.url]
+    news_sources = [s for s in sources if "wikipedia.org" not in s.url]
+    # Build ordered list: Wikipedia first, then news, ensuring rich diversity
+    ordered: list[tuple] = []  # (source, layout_x, layout_y, layout_rot, source_type_label)
+    # Always include Wikipedia if available
+    for ws in wiki_sources[:1]:
+        x, y, rot = _SOURCE_LAYOUT_WIKI
+        ordered.append((ws, x, y, rot, "Historical Archive"))
+    # Always include at least 2 news articles
+    news_idx = 0
+    for ns in news_sources[:3]:
+        x, y, rot = _SOURCE_LAYOUT_NEWS[news_idx % len(_SOURCE_LAYOUT_NEWS)]
+        ordered.append((ns, x, y, rot, "News Article"))
+        news_idx += 1
+    # If we still have < 3 sources, fill with remaining Wikipedia
+    if len(ordered) < 3:
+        for ws in wiki_sources[1:3 - len(ordered) + 1]:
+            x, y, rot = _SOURCE_LAYOUT_NEWS[news_idx % len(_SOURCE_LAYOUT_NEWS)]
+            ordered.append((ws, x, y, rot, "Historical Archive"))
+            news_idx += 1
+    # ── Build nodes + connections for each source ────────────────────────
+    # Build a score lookup for per-source NLI
+    source_score_map: dict[str, float] = {}
+    if per_source_scores and len(per_source_scores) == len(sources):
+        for s, sc in zip(sources, per_source_scores):
+            source_score_map[s.url] = sc
+    for i, (src, x, y, rot, type_label) in enumerate(ordered[:4]):
+        # Determine role based on trust level
+        if src.trust == "low":
+            role = "hostile"
+        else:
+            role = "amplifier"
+        # Extract a readable author name
+        if " - " in src.title:
+            author = src.title.split(" - ")[-1].strip()[:30]
+        elif "wikipedia.org" in src.url:
+            author = "WIKIPEDIA"
+        else:
+            author = src.title[:30] if src.title else "Unknown Source"
+        days_ago = rng.randint(1, 14)
+        ts = (now - timedelta(days=days_ago)).strftime("%Y-%m-%d %H:%M")
+        node_id = f"source_{i + 1}"
+        nodes.append(FrontendEvidenceNode(
+            id=node_id,
+            role=role,
+            type=type_label,
+            date=ts,
+            author=author,
+            content=src.snippet[:150] if src.snippet else src.title,
+            x=x,
+            y=y,
+            rotation=rot,
+            url=src.url if src.url else None,
+        ))
+        # ── Connection: source → claim with per-source NLI ───────────────
+        src_score = source_score_map.get(src.url, verification_score)
+        nli_type = "entailment" if src_score >= 0.65 else "contradiction"
+        nli_score = max(10, min(99, int(src_score * 100)))
+        conns.append(FrontendConnection(
+            from_field=node_id,
+            to="claim_0",
+            nli={"type": nli_type, "score": nli_score},
+        ))
+    return nodes, conns
+def _extract_ground_truth_string(sources: list) -> str:
+    """Extract the established fact string from the highest-trust source."""
+    if not sources:
+        return "No established fact could be determined from available sources."
+    # Prefer Wikipedia first
+    for s in sources:
+        if "wikipedia.org" in s.url:
+            return s.snippet[:300] if s.snippet else s.title
+    # Then any high-trust source
+    for s in sources:
+        if s.trust == "high" and s.snippet:
+            return s.snippet[:300]
+    # Fallback to first source with a snippet
+    for s in sources:
+        if s.snippet:
+            return s.snippet[:300]
+    return "No established fact could be determined from available sources."
+def _build_segments(
+    text: str,
+    suspicious: dict,
+    ground_truth: GroundTruthData,
+    ml_label: str = "",
+    ml_confidence: float = 0.0,
+) -> list[FrontendSegment]:
+    """
+    Split the claim text into annotated segments.
+    Prepends a Linguistic Analysis segment with the ML model's reasoning,
+    then uses suspicious phrase detection + ground truth evidence.
+    """
+    segments: list[FrontendSegment] = []
+    # ── Segment 0: ML Model Linguistic Analysis ──────────────────────────
+    if ml_label:
+        ml_label_display = ml_label.upper()
+        ml_pct = int(ml_confidence * 100)
+        if ml_label_display == "FAKE":
+            ml_note = (
+                f"The local NLP model analyzed the linguistic syntax and scored "
+                f"this claim at {ml_pct}% FAKE due to sensationalist phrasing, "
+                f"emotional manipulation, or patterns consistent with disinformation."
+            )
+        elif ml_label_display == "REAL":
+            ml_note = (
+                f"The local NLP model analyzed the linguistic syntax and scored "
+                f"this claim at {ml_pct}% REAL — professional journalistic tone "
+                f"detected with minimal sensationalist markers."
+            )
+        else:
+            ml_note = (
+                f"The local NLP model analyzed the linguistic syntax but could "
+                f"not reach a definitive conclusion (confidence: {ml_pct}%). "
+                f"The text contains a mix of professional and informal language patterns."
+            )
+        segments.append(FrontendSegment(
+            text=f"[LINGUISTIC ANALYSIS] ",
+            isSuspicious=True,
+            annotation=FrontendAnnotation(type="unverified", note=ml_note),
+        ))
+    # ── Collect evidence items as potential annotations ───────────────────
+    evidence_annotations: list[tuple[str, str]] = []
+    for item in ground_truth.evidence_items:
+        evidence_annotations.append((item.badge, item.text))
+    sus_phrases: list[str] = []
+    for key in ["clickbait_phrases", "emotional_language", "unsupported_claims"]:
+        sus_phrases.extend(suspicious.get(key, []))
+    import re as _re
+    sentences = _re.split(r'(?<=[.!?])\s+', text.strip())
+    if not sentences:
+        segments.append(FrontendSegment(text=text, isSuspicious=False))
+        return segments
+    badge_to_annotation_type = {
+        "FALLACY": "fallacy",
+        "UNVERIFIED": "unverified",
+        "CONTRADICTION": "contradiction",
+        "CORROBORATED": "verified",
+    }
+    evidence_idx = 0
+    for sentence in sentences:
+        sentence_text = sentence.strip()
+        if not sentence_text:
+            continue
+        if not sentence_text.endswith(" "):
+            sentence_text += " "
+        is_sus = any(phrase.lower() in sentence_text.lower() for phrase in sus_phrases)
+        if not is_sus and evidence_idx < len(evidence_annotations) and len(sentences) <= 5:
+            is_sus = True
+        annotation = None
+        if is_sus and evidence_idx < len(evidence_annotations):
+            badge, note = evidence_annotations[evidence_idx]
+            ann_type = badge_to_annotation_type.get(badge, "unverified")
+            annotation = FrontendAnnotation(type=ann_type, note=note)
+            evidence_idx += 1
+        segments.append(FrontendSegment(
+            text=sentence_text,
+            isSuspicious=is_sus and annotation is not None,
+            annotation=annotation,
+        ))
+    return segments
+def _build_confidence_explanation(
+    ml_label: str,
+    ml_confidence: float,
+    similarity_score: float,
+    num_sources: int,
+    high_trust_count: int,
+    low_trust_count: int,
+    final_prediction: str,
+    final_confidence: int,
+    wiki_verified: bool,
+) -> str:
+    """Build a highly detailed, analytical explanation of how the confidence score was derived."""
+    parts: list[str] = []
+    # ── 1. ML Model analysis ─────────────────────────────────────────────
+    ml_pct = int(ml_confidence * 100)
+    parts.append(
+        f"STEP 1 — LINGUISTIC ANALYSIS: The local DistilBERT NLP model "
+        f"classified the text as {ml_label.upper()} with {ml_pct}% internal "
+        f"confidence after analyzing syntax patterns, sensationalist markers, "
+        f"and journalistic tone indicators."
+    )
+    # ── 2. Cross-Encoder verification ────────────────────────────────────
+    sim_pct = int(similarity_score * 100)
+    threshold_met = "PASSED" if similarity_score >= 0.65 else "FAILED"
+    parts.append(
+        f"STEP 2 — CROSS-ENCODER VERIFICATION: A live internet scan retrieved "
+        f"{num_sources} source(s). The Cross-Encoder semantic similarity scored "
+        f"{sim_pct}% against the 65% entailment threshold ({threshold_met}). "
+        f"{'Wikipedia independently corroborated the claim.' if wiki_verified else 'No Wikipedia corroboration was found.'}"
+    )
+    # ── 3. Source trust breakdown ─────────────────────────────────────────
+    medium_trust = num_sources - high_trust_count - low_trust_count
+    parts.append(
+        f"STEP 3 — SOURCE TRUST AUDIT: Of {num_sources} sources, "
+        f"{high_trust_count} rated HIGH trust, {medium_trust} rated MEDIUM, "
+        f"and {low_trust_count} rated LOW. "
+        f"{'A strong evidence base supports this verdict.' if high_trust_count >= 2 else 'The evidence base is limited, which affects overall confidence.'}"
+    )
+    # ── 4. Guardrail activations ─────────────────────────────────────────
+    guardrails: list[str] = []
+    if num_sources == 0:
+        guardrails.append("ZERO-EVIDENCE PENALTY (no sources found, verdict forced to FABRICATED)")
+    if final_prediction == "Uncertain" and similarity_score < 0.78 and not wiki_verified:
+        guardrails.append("MUDDY WATERS GUARDRAIL (weak corroboration, verdict shifted to INCONCLUSIVE)")
+    if guardrails:
+        parts.append(f"STEP 4 — GUARDRAILS TRIGGERED: {'; '.join(guardrails)}.")
+    else:
+        parts.append("STEP 4 — GUARDRAILS: No safety overrides were triggered. The verdict reflects the raw analysis.")
+    # ── 5. Final synthesis ───────────────────────────────────────────────
+    parts.append(
+        f"FINAL SYNTHESIS: Combining the ML model's {ml_label.upper()} signal, "
+        f"the {sim_pct}% semantic match, and {num_sources} source(s), the system "
+        f"arrived at a final confidence of {final_confidence}%."
+    )
+    return " ▸ ".join(parts)
+# ── Endpoints ───────────────────────────────────────────────────────────────
+@app.get("/health")
+async def health():
+    return {"status": "healthy", "service": "VeriLens AI"}
+@app.post("/analyze", response_model=AnalyzeResponse)
+async def analyze(req: AnalyzeRequest):
+    raw = req.input.strip()
+    if not raw:
+        raise HTTPException(status_code=400, detail="Input cannot be empty.")
+    t0 = time.time()
+    if _is_url(raw):
+        input_type = "URL"
+        try:
+            article = extract_article(raw)
+            text = f"{article.title}. {article.text}"
+        except ValueError as exc:
+            raise HTTPException(status_code=422, detail=str(exc))
+    else:
+        input_type = "TEXT"
+        text = raw
+    language = detect_language(text)
+    keywords = extract_keywords(text, top_n=8)
+    suspicious = detect_suspicious_phrases(text)
+    search_query = build_search_query(text)
+    ml_result = classify(text)
+    verification = await verify_claim(text, search_query)
+    high_trust = sum(1 for s in verification.sources if s.trust == "high")
+    low_trust = sum(1 for s in verification.sources if s.trust == "low")
+    # ── Decision ────────────────────────────────────────────────────────────
+    decision = make_decision(
+        ml_label=ml_result.label,
+        ml_confidence=ml_result.confidence,
+        similarity_score=verification.similarity_score,
+        sources_verified=verification.verified,
+        suspicious_info=suspicious,
+        high_trust_count=high_trust,
+        low_trust_count=low_trust,
+    )
+    final_prediction = str(decision.prediction).title()  # .title() makes it "Real", "Fake", or "Uncertain"
+    final_confidence = int(decision.confidence)
+    final_explanation = str(decision.explanation)
+     # 🕵️ Check if Wikipedia is one of the verified sources
+    wiki_verified = any("wikipedia.org" in s.url for s in verification.sources)
+    # 🛡️ THE BULLETPROOF ZERO-EVIDENCE PENALTY (The "Ojas" Rule) 🛡️
+    # Catch both Real and Uncertain guesses if there is NO evidence
+    if final_prediction in ["Real", "Uncertain"] and len(verification.sources) == 0:
+        logger.warning("Zero-Evidence Penalty triggered! Overriding AI verdict.")
+        final_prediction = "Fake"
+        final_confidence = 10  # This forces the UI bar to "Unreliable" (RED)
+        final_explanation = "The AI text analysis found no sensationalism, but a live internet scan found ZERO evidence to support this claim. In journalism, a total lack of corroboration for a statement indicates it is unverified or FAKE."
+    # 🛡️ NEW: THE "MUDDY WATERS" GUARDRAIL 🛡️
+    # If the AI says REAL, but the internet context match is weak/moderate (< 0.78)
+    elif final_prediction == "Real" and verification.similarity_score < 0.78 and not wiki_verified:
+        logger.warning("Muddy Waters Guardrail triggered! Weak internet corroboration.")
+        final_prediction = "Uncertain"
+        final_confidence = 50  # Pushes UI perfectly to the center YELLOW
+        final_explanation = "The AI detected a professional journalistic tone, and related topics were found online. However, the EXACT claim could not be highly corroborated by the Cross-Encoder. This may be a misleading mix of real entities and fake events."
+    # ── Build supplementary data for Figma dashboard ────────────────────
+    source_outs = [SourceOut(title=s.title, url=s.url, snippet=s.snippet, trust=s.trust)
+                   for s in verification.sources]
+    verdict_label = _VERDICT_MAP.get(final_prediction, "UNDER REVIEW")
+    case_number = _generate_case_number(text)
+    origin_map = _build_origin_map(verification.sources, verification.similarity_score, text)
+    ground_truth = _build_ground_truth(
+        final_prediction, final_explanation, suspicious, keywords, verification.sources
+    )
+    # ── Build frontend-compatible structures ─────────────────────────────
+    frontend_verdict = _FRONTEND_VERDICT_MAP.get(final_prediction, "INCONCLUSIVE")
+    frontend_source_tree, frontend_connections = _build_direct_source_tree(
+        text, verification.sources, verification.similarity_score,
+    )
+    frontend_segments = _build_segments(
+        text, suspicious, ground_truth,
+        ml_label=ml_result.label, ml_confidence=ml_result.confidence,
+    )
+    ground_truth_string = _extract_ground_truth_string(verification.sources)
+    # ── Build the detailed confidence explanation ─────────────────────────
+    confidence_explanation = _build_confidence_explanation(
+        ml_label=ml_result.label,
+        ml_confidence=ml_result.confidence,
+        similarity_score=verification.similarity_score,
+        num_sources=len(verification.sources),
+        high_trust_count=high_trust,
+        low_trust_count=low_trust,
+        final_prediction=final_prediction,
+        final_confidence=final_confidence,
+        wiki_verified=wiki_verified,
+    )
+    elapsed = int((time.time() - t0) * 1000)
+    return AnalyzeResponse(
+        input_type=input_type,
+        prediction=final_prediction,
+        confidence=final_confidence,
+        explanation=final_explanation,
+        sources=source_outs,
+        language=language,
+        keywords=keywords,
+        suspicious=suspicious,
+        factors=decision.factors,
+        elapsed_ms=elapsed,
+        verdict_label=verdict_label,
+        case_number=case_number,
+        origin_map=origin_map,
+        ground_truth=ground_truth,
+        # ── Frontend fields ──────────────────────────────────────────────
+        claim=text,
+        verdict=frontend_verdict,
+        segments=frontend_segments,
+        sourceTree=frontend_source_tree,
+        connections=frontend_connections,
+        groundTruth=ground_truth_string,
+        confidenceExplanation=confidence_explanation,
+    )

model.py ADDED Viewed

	@@ -0,0 +1,141 @@

+"""
+ML Classifier for VeriLens AI
+Primary:  HuggingFace text-classification pipeline (DistilBERT).
+Fallback: Heuristic keyword-based scoring when the model is unavailable.
+"""
+from __future__ import annotations
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+logger = logging.getLogger(__name__)
+# ── Lazy-loaded globals ─────────────────────────────────────────────────────
# Both are populated by load_model(); until then classify() uses the heuristic.
_pipeline = None
_model_ready = False


@dataclass
class ClassificationResult:
    """Outcome of classifying one piece of text."""

    label: str          # "FAKE", "REAL", or "UNCERTAIN"
    confidence: float   # 0.0 – 1.0


# ── Heuristic fallback ─────────────────────────────────────────────────────
_FAKE_SIGNALS = [
    "you won't believe", "shocking", "exposed", "secret",
    "they don't want you to know", "mind-blowing", "conspiracy",
    "cover-up", "banned", "censored", "wake up", "big pharma",
    "doctors hate", "one weird trick", "must watch",
    "share before it's too late", "mainstream media won't tell you",
    "spread this before it's deleted", "bombshell", "unbelievable",
]
_REAL_SIGNALS = [
    "according to", "officials said", "the report states",
    "data shows", "peer-reviewed", "study published",
    "reuters", "associated press", "confirmed by",
    "government statement", "press release", "research findings",
    "published in the journal", "the investigation found",
]


def _count_signal_hits(lowered: str, signals: list[str]) -> int:
    """Count how many *signals* occur in *lowered* as whole words/phrases."""
    import re  # local import: this module does not import ``re`` at file level

    # Look-arounds instead of plain ``in`` so that "secret" does not fire on
    # "secretary" and "banned" does not fire on "unbanned".
    return sum(
        1
        for phrase in signals
        if re.search(rf"(?<!\w){re.escape(phrase)}(?!\w)", lowered)
    )


def _heuristic_classify(text: str) -> ClassificationResult:
    """
    Keyword-based scoring used when the transformer is unavailable.

    Counts whole-word occurrences of the fake/real signal phrases and turns
    the share of fake signals into a label:
      * no signal at all          -> UNCERTAIN, 0.50
      * fake share above 0.6      -> FAKE,  0.5 + share * 0.4
      * fake share below 0.4      -> REAL,  0.5 + (1 - share) * 0.4
      * anything in between       -> UNCERTAIN, 0.55
    """
    lower = text.lower()
    fake_hits = _count_signal_hits(lower, _FAKE_SIGNALS)
    real_hits = _count_signal_hits(lower, _REAL_SIGNALS)
    total = fake_hits + real_hits
    if total == 0:
        return ClassificationResult(label="UNCERTAIN", confidence=0.50)

    fake_ratio = fake_hits / total
    if fake_ratio > 0.6:
        return ClassificationResult(label="FAKE", confidence=round(0.5 + fake_ratio * 0.4, 2))
    if fake_ratio < 0.4:
        return ClassificationResult(label="REAL", confidence=round(0.5 + (1 - fake_ratio) * 0.4, 2))
    return ClassificationResult(label="UNCERTAIN", confidence=0.55)
+# ── Model loading ──────────────────────────────────────────────────────────
_LOCAL_MODEL_DIR = Path(__file__).resolve().parent / "trained_model_v2"


def load_model() -> None:
    """
    Load the text-classification pipeline.

    Prefers the locally fine-tuned model in ./trained_model_v2 (detected by
    the presence of its config.json); otherwise falls back to the HuggingFace
    remote model. The pipeline is placed on CUDA, then Apple MPS, then CPU,
    whichever is available first.

    Call once at startup; once a load has succeeded, further calls are no-ops.
    Any failure is logged and leaves the heuristic fallback in charge.
    """
    global _pipeline, _model_ready
    if _model_ready:
        return
    try:
        # Imported lazily so the module stays importable without the ML stack.
        from transformers import pipeline as hf_pipeline
        import torch

        # ⚡ Universal Hardware Detection (Windows / Mac / Linux)
        if torch.cuda.is_available():
            active_device = torch.device("cuda")
            gpu_name = torch.cuda.get_device_name(0)
            logger.info("Hardware detection: NVIDIA GPU (%s) found. Routing to CUDA.", gpu_name)
        elif torch.backends.mps.is_available():
            active_device = torch.device("mps")
            logger.info("Hardware detection: Apple Silicon found. Routing to MPS.")
        else:
            active_device = torch.device("cpu")
            logger.info("Hardware detection: No GPU found. Defaulting to CPU.")

        # config.json existing already implies the directory exists.
        if (_LOCAL_MODEL_DIR / "config.json").exists():
            model_path = str(_LOCAL_MODEL_DIR)
            logger.info("Loading locally trained model from %s …", model_path)
        else:
            model_path = "hamzab/roberta-fake-news-classification"
            logger.info("Loading HuggingFace remote model: %s …", model_path)

        # ⚡ Pass the dynamically selected device to the pipeline
        _pipeline = hf_pipeline(
            "text-classification",
            model=model_path,
            truncation=True,
            max_length=512,
            device=active_device,
        )
        _model_ready = True
        logger.info("Model loaded successfully.")
    except Exception as exc:
        # Deliberate best-effort: classify() falls back to the heuristic.
        logger.warning("Could not load model (%s). Using heuristic fallback.", exc)
        _model_ready = False
def classify(text: str) -> ClassificationResult:
    """
    Classify *text* as REAL or FAKE.

    Uses the loaded transformer pipeline when available; falls back to
    heuristic scoring if the model is not loaded or inference raises.
    The label is "UNCERTAIN" when the model emits a label this function
    does not recognise.
    """
    if not _model_ready or _pipeline is None:
        return _heuristic_classify(text)
    try:
        # Truncate very long texts for speed
        truncated = text[:2048]
        result = _pipeline(truncated)[0]
        raw_label: str = str(result["label"]).upper()
        score: float = result["score"]

        # Normalise labels coming from the model. TRUE/FALSE are accepted as
        # well as REAL/FAKE so a checkpoint using that naming is not silently
        # reported as UNCERTAIN.
        # NOTE(review): the remote fallback checkpoint is believed to label
        # its classes FAKE/TRUE — confirm against its config.json.
        if "FAKE" in raw_label or raw_label in ("LABEL_0", "FALSE"):
            label = "FAKE"
        elif "REAL" in raw_label or raw_label in ("LABEL_1", "TRUE"):
            label = "REAL"
        else:
            label = "UNCERTAIN"
        return ClassificationResult(label=label, confidence=round(score, 4))
    except Exception as exc:
        logger.error("Model inference failed: %s – falling back to heuristic.", exc)
        return _heuristic_classify(text)

nlp_utils.py ADDED Viewed

	@@ -0,0 +1,236 @@

+"""
+NLP Utilities for VeriLens AI
+- Text preprocessing (lowercasing, stopword removal, tokenization)
+- Keyword extraction for search queries
+- Suspicious phrase detection
+- Language detection (English / Hindi)
+"""
+import re
+import string
+# ── stopwords (lightweight, no NLTK download needed) ────────────────────────
+ENGLISH_STOPWORDS = {
+    "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
+    "have", "has", "had", "do", "does", "did", "will", "would", "could",
+    "should", "may", "might", "shall", "can", "need", "dare", "ought",
+    "used", "to", "of", "in", "for", "on", "with", "at", "by", "from",
+    "as", "into", "through", "during", "before", "after", "above", "below",
+    "between", "out", "off", "over", "under", "again", "further", "then",
+    "once", "here", "there", "when", "where", "why", "how", "all", "both",
+    "each", "few", "more", "most", "other", "some", "such", "no", "nor",
+    "not", "only", "own", "same", "so", "than", "too", "very", "just",
+    "because", "but", "and", "or", "if", "while", "about", "up", "its",
+    "it", "he", "she", "they", "we", "you", "i", "me", "him", "her",
+    "us", "them", "my", "your", "his", "our", "their", "this", "that",
+    "these", "those", "what", "which", "who", "whom", "s", "t", "don",
+    "didn", "doesn", "hadn", "hasn", "haven", "isn", "wasn", "weren",
+    "won", "wouldn", "couldn", "shouldn", "ain", "aren", "re", "ve", "ll",
+}
+# ── suspicious / clickbait phrases ──────────────────────────────────────────
+CLICKBAIT_PHRASES = [
+    "you won't believe",
+    "shocking",
+    "breaking",
+    "exposed",
+    "secret",
+    "they don't want you to know",
+    "what they're hiding",
+    "mind-blowing",
+    "jaw-dropping",
+    "unbelievable",
+    "gone wrong",
+    "doctors hate",
+    "one weird trick",
+    "this will change everything",
+    "spread this before it's deleted",
+    "mainstream media won't tell you",
+    "exposed the truth",
+    "wake up",
+    "big pharma",
+    "conspiracy",
+    "cover-up",
+    "coverup",
+    "bombshell",
+    "urgent",
+    "must watch",
+    "must read",
+    "share before it's too late",
+    "banned",
+    "censored",
+]
+EMOTIONAL_PHRASES = [
+    "absolutely",
+    "totally",
+    "completely",
+    "utterly",
+    "extremely",
+    "terrifying",
+    "horrifying",
+    "devastating",
+    "outrageous",
+    "disgusting",
+    "insane",
+    "crazy",
+    "incredible",
+    "miraculous",
+    "phenomenal",
+    "unprecedented",
+    "never before seen",
+    "the truth about",
+    "exposed",
+    "the real story",
+]
+UNSUPPORTED_CLAIM_MARKERS = [
+    "sources say",
+    "experts believe",
+    "studies show",
+    "according to sources",
+    "rumor has it",
+    "allegedly",
+    "it is believed",
+    "some people say",
+    "many believe",
+    "reports suggest",
+    "anonymous sources",
+    "unnamed officials",
+    "insiders reveal",
+]
+# ── Hindi character range for language detection ────────────────────────────
+HINDI_PATTERN = re.compile(r"[\u0900-\u097F]")
def preprocess_text(text: str) -> str:
    """Return *text* lowercased, with ASCII punctuation and stopwords removed."""
    punctuation_stripper = str.maketrans("", "", string.punctuation)
    normalised = text.lower().translate(punctuation_stripper)
    kept = (word for word in normalised.split() if word not in ENGLISH_STOPWORDS)
    return " ".join(kept)
def extract_keywords(text: str, top_n: int = 10) -> list[str]:
    """
    Return up to *top_n* of the most frequent non-stopword tokens in *text*.

    Tokens of one or two characters are ignored. Tokens with equal counts
    keep their order of first appearance.
    """
    counts: dict[str, int] = {}
    for token in preprocess_text(text).split():
        if len(token) <= 2:
            continue
        counts[token] = counts.get(token, 0) + 1
    # sorted() is stable, also with reverse=True, so ties stay in insertion order.
    ranked = sorted(counts, key=counts.get, reverse=True)  # type: ignore
    return ranked[:top_n]
+import re
+import re
# Conversational filler, clickbait, and Gen Z slang phrases stripped from a
# claim before it is turned into a search query. Order matters: the phrases
# are removed one after another in this order.
_QUERY_FILLERS = (
    # News/WhatsApp filler
    "is it true that", "i heard that", "someone told me", "can you check if",
    "they are saying", "breaking news", "shocking", "whatsapp forward",
    "forwarded as received", "please verify", "pls verify", "can you verify",
    "fact check this", "tell me if", "did you hear", "rumor has it",
    "watch till the end", "viral video", "secret exposed", "must watch",
    "mind blowing", "i read somewhere", "is this real", "is this fake",
    "check this news", "verify this claim", "you won't believe",
    "alert:", "warning:", "urgent:", "fwd:", "bro is it true", "bhau tell me",
    # Gen Z / Internet Slang Phrases
    "no cap", "fr fr", "on god", "spill the tea", "is it giving",
    "big yikes", "to be honest", "not gonna lie", "out of pocket",
    "let him cook", "make it make sense", "rent free", "touch grass",
    "caught in 4k", "main character energy", "pop off", "periodt",
    "for real", "deadass", "lowkey", "highkey", "tbh", "ngl", "chat is this real",
    "make it viral",
)


def _filler_pattern(phrase: str) -> "re.Pattern[str]":
    """Compile *phrase* so it only matches as a whole word/phrase."""
    # A boundary is only required on a side that starts/ends with a word
    # character, so fillers such as "alert:" still match before any text.
    head = r"(?<!\w)" if re.match(r"\w", phrase[0]) else ""
    tail = r"(?!\w)" if re.match(r"\w", phrase[-1]) else ""
    return re.compile(head + re.escape(phrase) + tail)


_QUERY_FILLER_PATTERNS = tuple(_filler_pattern(p) for p in _QUERY_FILLERS)

# Comprehensive English Stop Words + Gen Z "Brainrot" Dictionary
_QUERY_STOP_WORDS = frozenset({
    # Standard English NLP Stop Words
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your",
    "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she",
    "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
    "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
    "these", "those", "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an",
    "the", "and", "but", "if", "or", "because", "as", "until", "while", "of",
    "at", "by", "for", "with", "about", "against", "between", "into", "through",
    "during", "before", "after", "above", "below", "to", "from", "up", "down",
    "in", "out", "on", "off", "over", "under", "again", "further", "then",
    "once", "here", "there", "when", "where", "why", "how", "all", "any",
    "both", "each", "few", "more", "most", "other", "some", "such", "no",
    "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s",
    "t", "can", "will", "just", "don", "should", "now", "d", "ll", "m", "o",
    "re", "ve", "y", "ain", "aren", "couldn", "didn", "doesn", "hadn", "hasn",
    "haven", "isn", "ma", "mightn", "mustn", "needn", "shan", "shouldn",
    "wasn", "weren", "won", "wouldn", "tell", "know", "think", "believe",
    "say", "said", "saying", "ask", "asked", "check", "news", "today", "new",
    # Gen Z / Internet Slang Single Words
    "fr", "cap", "bruh", "bro", "dude", "rn", "skibidi", "rizz", "sigma",
    "bet", "af", "smh", "idk", "idc", "lmao", "lmfao", "lol", "rofl", "omg",
    "sus", "legit", "bussin", "yall", "based", "cringe", "ratio", "gyatt",
    "mewing", "lit", "fire", "tea", "dub", "flop", "iykyk", "literally",
    "actually", "basically", "seriously", "like", "yap", "yapping",
    "delulu", "solulu", "pookie", "aura", "chat", "fyi", "lmk", "tldr",
})


def build_search_query(text: str, max_keywords: int = 8) -> str:
    """
    Strip conversational filler and slang from *text* and return its core
    claim as a space-separated, lowercased keyword query.

    Steps: lowercase; remove filler phrases (whole-phrase matches only, so
    "ngl" no longer eats the middle of "England"); split into word tokens;
    drop stop words and slang; keep the first *max_keywords* tokens.

    Returns the first 50 characters of the original *text* when nothing
    survives the filtering.
    """
    clean_text = text.lower()
    for pattern in _QUERY_FILLER_PATTERNS:
        clean_text = pattern.sub(" ", clean_text)

    # Keep only alphanumeric words
    words = re.findall(r'\b\w+\b', clean_text)
    core_keywords = [word for word in words if word not in _QUERY_STOP_WORDS]

    # Cap the query length so the news search is not overwhelmed
    final_query = " ".join(core_keywords[:max_keywords])

    # Fallback just in case they typed nothing but slang/stop words
    return final_query if final_query else text[:50]
def detect_language(text: str) -> str:
    """
    Return "hi" when more than 30% of the alphabetic characters in *text*
    fall in the Devanagari block matched by HINDI_PATTERN, else "en".

    Text without any alphabetic character is reported as "en".
    """
    devanagari_count = len(HINDI_PATTERN.findall(text))
    letter_count = sum(map(str.isalpha, text))
    if letter_count == 0:
        return "en"
    return "hi" if devanagari_count / letter_count > 0.3 else "en"
def _contains_phrase(lowered: str, phrase: str) -> bool:
    """Return True if *phrase* occurs in *lowered* as a whole word/phrase."""
    # Look-arounds rather than plain ``in`` so "urgent" does not fire on
    # "insurgent" and "secret" does not fire on "secretary".
    return re.search(rf"(?<!\w){re.escape(phrase)}(?!\w)", lowered) is not None


def detect_suspicious_phrases(text: str) -> dict:
    """
    Scan *text* for clickbait, emotional, and unsupported-claim markers.

    Matching is case-insensitive and whole-phrase: a marker only counts when
    it is not embedded inside a longer word.

    Returns a dict with the matched phrases per category
    ("clickbait_phrases", "emotional_language", "unsupported_claims") and
    their combined count under "total_suspicious_count".
    """
    lower = text.lower()
    found_clickbait = [p for p in CLICKBAIT_PHRASES if _contains_phrase(lower, p)]
    found_emotional = [p for p in EMOTIONAL_PHRASES if _contains_phrase(lower, p)]
    found_unsupported = [p for p in UNSUPPORTED_CLAIM_MARKERS if _contains_phrase(lower, p)]
    total = len(found_clickbait) + len(found_emotional) + len(found_unsupported)
    return {
        "clickbait_phrases": found_clickbait,
        "emotional_language": found_emotional,
        "unsupported_claims": found_unsupported,
        "total_suspicious_count": total,
    }
def tokenize(text: str) -> list[str]:
    """Lowercase *text*, turn every punctuation character into a space, and split on whitespace."""
    without_punctuation = re.sub(r"[^\w\s]", " ", text.lower())
    return without_punctuation.split()

requirements.txt ADDED Viewed

	@@ -0,0 +1,24 @@

+# ── VeriLens AI V2 Requirements ──────────────────────────────────────────────
+# Web Server & API
+fastapi==0.115.0
+uvicorn[standard]==0.30.6
+pydantic>=2.0.0
+python-dotenv==1.0.1
+httpx==0.27.2
+# Modern Web Scraping (Replaces newspaper3k)
+trafilatura>=1.12.0
+lxml-html-clean==0.4.1
+# Machine Learning & Transformers
+torch==2.4.1
+transformers==4.44.2
+sentence-transformers==3.0.1
+scikit-learn==1.5.1
+numpy>=1.24.0
+pandas>=2.0.0
+# OS & Internet Tools
+duckduckgo-search>=7.0.0
+certifi

scraper.py ADDED Viewed

	@@ -0,0 +1,77 @@

+"""
+Web Scraper for VeriLens AI (V2 - Trafilatura Engine)
+Uses the modern trafilatura library to bypass bot-blockers,
+strip out cookie banners, and extract pristine article text for NLP.
+"""
+from __future__ import annotations
+import logging
+from dataclasses import dataclass
+import trafilatura
+logger = logging.getLogger(__name__)
@dataclass
class ScrapedArticle:
    """Text and metadata extracted from a single article URL."""

    title: str                 # article headline
    text: str                  # extracted body text
    authors: list[str]         # author names; empty when none were found
    publish_date: str | None   # date as reported by the extractor, if any
    source_url: str            # the URL that was scraped
def extract_article(url: str) -> ScrapedArticle:
    """
    Download and parse a news article from *url* using Trafilatura.

    Returns a ScrapedArticle with the title (or "Unknown Title"), body text,
    author list, publication date and the source URL.

    Raises ValueError if the page cannot be fetched or if fewer than 50
    characters of text could be extracted.
    """
    logger.info("Attempting to scrape URL: %s", url)

    # 1. Fetch the raw HTML
    downloaded = trafilatura.fetch_url(url)
    if downloaded is None:
        logger.error("Fetch failed for %s. The site may be down or actively blocking bots.", url)
        raise ValueError("Could not access URL. The site may be blocking automated requests or is invalid.")

    # 2. Extract the text and metadata.
    # We disable comments and tables to keep the text as pure as possible for the AI.
    extracted = trafilatura.bare_extraction(
        downloaded,
        include_comments=False,
        include_tables=False
    )
    # NOTE(review): older Trafilatura releases return a plain dict here, newer
    # ones are believed to return a Document object exposing as_dict() —
    # confirm against the installed version. Normalise to a dict either way
    # so the .get() calls below keep working.
    if extracted is not None and not isinstance(extracted, dict):
        to_dict = getattr(extracted, "as_dict", None)
        extracted = to_dict() if callable(to_dict) else None

    # 3. Guardrail: Did we actually get text?
    text = (extracted.get('text') or '') if extracted else ''
    if len(text.strip()) < 50:
        logger.warning("Extraction failed or returned too little text for %s", url)
        raise ValueError(
            "Extracted article content is too short or empty. "
            "The URL may be a video, a paywalled article, or heavily obfuscated with JavaScript."
        )

    # 4. Clean up the metadata
    title = extracted.get('title') or "Unknown Title"
    date = extracted.get('date')

    # Authors may arrive as one string separated by semicolons or commas.
    raw_author = extracted.get('author')
    if raw_author:
        # Split by comma or semicolon and clean up whitespace
        authors = [a.strip() for a in raw_author.replace(';', ',').split(',') if a.strip()]
    else:
        authors = []

    logger.info("Successfully scraped: '%s...' (%d characters)", title[:30], len(text))
    return ScrapedArticle(
        title=title,
        text=text,
        authors=authors,
        publish_date=date,
        source_url=url,
    )

trained_model_v2/config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "_name_or_path": "distilbert-base-uncased",
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForSequenceClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "id2label": {
+    "0": "FAKE",
+    "1": "REAL"
+  },
+  "initializer_range": 0.02,
+  "label2id": {
+    "FAKE": 0,
+    "REAL": 1
+  },
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "problem_type": "single_label_classification",
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.44.2",
+  "vocab_size": 30522
+}

trained_model_v2/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:88d5997db34a6989bc93d791e3f16f0e8a330b449f3cab3bc064057bd9e1e2d3
+size 267832560

trained_model_v2/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

trained_model_v2/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

trained_model_v2/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,55 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "DistilBertTokenizer",
+  "unk_token": "[UNK]"
+}

trained_model_v2/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

verifier.py ADDED Viewed

	@@ -0,0 +1,422 @@

+"""
+Internet Verifier for VeriLens AI
+- Searches the web via Google News RSS for live, rate-limit-proof verification.
+- Searches Wikipedia API for historical fact verification.
+- Computes strict semantic entailment using a Cross-Encoder.
+"""
+from __future__ import annotations
+import urllib.request
+import urllib.parse
+import xml.etree.ElementTree as ET
+import re
+import json  # <-- Added for Wikipedia API
+import numpy as np  # <-- Added for softmax over NLI logits
+import asyncio
+import logging
+from dataclasses import dataclass, field
+logger = logging.getLogger(__name__)
+# ── Lazy-loaded Cross-Encoder ────────────────────────────────────────
_cross_model = None


def _get_cross_model():
    """
    Return the shared NLI Cross-Encoder, loading it on first use.

    Returns None when the model cannot be imported or loaded; the failure is
    logged and the next call will try again.
    """
    global _cross_model
    if _cross_model is not None:
        return _cross_model
    try:
        from sentence_transformers import CrossEncoder
        logger.info("Loading Multilingual NLI Cross-Encoder model…")
        # ⚡ Multilingual mDeBERTa — supports 100+ languages for global claim verification
        # Label order: [entailment=0, neutral=1, contradiction=2]
        _cross_model = CrossEncoder("MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7")
        logger.info("Multilingual NLI Cross-Encoder loaded successfully.")
    except Exception as exc:
        logger.warning("Could not load NLI Cross-Encoder: %s", exc)
    return _cross_model
@dataclass
class SourceArticle:
    """One search hit used as evidence for or against a claim."""

    title: str
    url: str
    snippet: str
    # Credibility bucket: "high", "medium" or "low".
    trust: str = "medium"
@dataclass
class VerificationResult:
    """Aggregated outcome of checking a claim against web sources."""

    similarity_score: float = 0.0
    # Each instance gets its own list (default_factory), never a shared one.
    sources: list[SourceArticle] = field(default_factory=list)
    verified: bool = False
+# ── Trusted domains (Expanded Global & Indian Scope) ───────────────────────
HIGH_TRUST_DOMAINS = {
    "wikipedia.org",  # <-- Added Wikipedia as a Ground Truth Source
    # 🌍 Global Wire Services (The original sources of most news)
    "reuters.com", "apnews.com", "bloomberg.com", "afp.com", "upi.com",
    # 🇺🇸/🇬🇧 Major US, UK & International Media
    "bbc.com", "bbc.co.uk", "nytimes.com", "washingtonpost.com", "wsj.com",
    "theguardian.com", "npr.org", "pbs.org", "cnn.com", "ft.com",
    "aljazeera.com", "dw.com", "france24.com", "scmp.com", "nbcnews.com",
    "cbsnews.com", "abcnews.go.com", "theatlantic.com", "time.com", "economist.com",
    # 🇮🇳 Indian National & Regional Heavyweights
    "thehindu.com", "hindustantimes.com", "indianexpress.com", "timesofindia.indiatimes.com",
    "ndtv.com", "indiatoday.in", "theprint.in", "thewire.in", "scroll.in",
    "livemint.com", "business-standard.com", "deccanherald.com", "telegraphindia.com",
    "tribuneindia.com", "newindianexpress.com", "firstpost.com", "thequint.com",
    "cnbctv18.com", "moneycontrol.com", "aninews.in", "ptinews.com", "freepressjournal.in",
    # 🔎 Dedicated Fact-Checkers (Massive Trust Boost if found)
    "snopes.com", "politifact.com", "factcheck.org", "altnews.in", "boomlive.in",
    "newschecker.in", "vishvasnews.com", "smhoaxinvestigator.com", "factchecker.in",
    # 🌐 High-Trust Aggregators
    # NOTE(review): Google News RSS items appear to link through
    # news.google.com, which would rate every such hit "high" — verify.
    "yahoo.com/news", "msn.com", "news.google.com"
}
# ── Low Trust / Disinformation / Satire domains ────────────────────────────
LOW_TRUST_DOMAINS = {
    # ⚠️ Known Fake News, Pseudoscience & Conspiracy
    "infowars.com", "naturalnews.com", "beforeitsnews.com", "thegatewaypundit.com",
    "zerohedge.com", "worldnewsdailyreport.com", "nationalreport.net",
    # 📢 State-Sponsored Propaganda
    "rt.com", "sputniknews.com", "globaltimes.cn",
    # 🇮🇳 Indian High-Bias / Frequently Flagged for Disinformation
    "postcard.news", "opindia.com", "tfipost.com", "kreately.in", "rightlog.in",
    # 🤡 Satire (If your engine matches these, the news is definitely fake)
    "theonion.com", "babylonbee.com", "fakingnews.com", "thefauxy.com",
    "thedailymash.co.uk", "waterfordwhispersnews.com", "clickhole.com"
}


def _split_host_path(url: str) -> tuple[str, str]:
    """Return the lowercased (hostname, path) of *url*; ("", "") if unparsable."""
    candidate = url.strip().lower()
    if "://" not in candidate:
        # Without a scheme urlsplit() would treat the host as part of the path.
        candidate = "//" + candidate
    try:
        parts = urllib.parse.urlsplit(candidate)
    except ValueError:
        return "", ""
    return parts.hostname or "", parts.path


def _matches_domain(host: str, path: str, entry: str) -> bool:
    """Return True if *host* is *entry*'s domain or a subdomain of it.

    Entries may carry a path prefix ("yahoo.com/news"); then the URL path
    must start with that prefix as well.
    """
    domain, has_path, prefix = entry.partition("/")
    if host != domain and not host.endswith("." + domain):
        return False
    return not has_path or path.startswith("/" + prefix)


def _trust_level(url: str, snippet: str = "", title: str = "") -> str:
    """
    Rate a source as "high", "medium" or "low" trust.

    Checks, in order:
      1. hostname listed in HIGH_TRUST_DOMAINS  -> "high"
      2. hostname listed in LOW_TRUST_DOMAINS   -> "low"
      3. snippet/title names a major wire service or outlet -> "high"
      4. otherwise                               -> "medium"

    Domains are compared against the URL's hostname (exact or subdomain),
    not as substrings, so "ft.com" does not match "microsoft.com" and
    "rt.com" does not match "smart.com". The low-trust list is consulted
    before the keyword check so a known satire or disinformation site cannot
    be promoted merely by mentioning a reputable outlet.
    """
    host, path = _split_host_path(url)

    # 1. Check URL Domains
    if any(_matches_domain(host, path, d) for d in HIGH_TRUST_DOMAINS):
        return "high"

    # 2. Check for known low-trust/satire sites
    if any(_matches_domain(host, path, d) for d in LOW_TRUST_DOMAINS):
        return "low"

    # 3. Check snippet OR title for major syndicated wire services
    lower_snippet = snippet.lower()
    lower_title = title.lower()
    high_trust_keywords = ["reuters", "associated press", "bbc", "cnn", "the new york times", "bloomberg"]
    for keyword in high_trust_keywords:
        if keyword in lower_snippet or keyword in lower_title:
            return "high"

    return "medium"
+# ── Locale detection for multilingual search ─────────────────────────────
_LOCALE_MAP = {
    (0x0900, 0x097F): ('hi', 'IN'),   # Devanagari → Hindi
    (0x0980, 0x09FF): ('bn', 'IN'),   # Bengali
    (0x0A00, 0x0A7F): ('pa', 'IN'),   # Gurmukhi → Punjabi
    (0x0A80, 0x0AFF): ('gu', 'IN'),   # Gujarati
    (0x0B80, 0x0BFF): ('ta', 'IN'),   # Tamil
    (0x0C00, 0x0C7F): ('te', 'IN'),   # Telugu
    (0x0C80, 0x0CFF): ('kn', 'IN'),   # Kannada
    (0x0D00, 0x0D7F): ('ml', 'IN'),   # Malayalam
    (0x0600, 0x06FF): ('ar', 'AE'),   # Arabic
    (0x4E00, 0x9FFF): ('zh', 'CN'),   # CJK → Chinese
    (0x3040, 0x30FF): ('ja', 'JP'),   # Hiragana/Katakana → Japanese
    (0xAC00, 0xD7AF): ('ko', 'KR'),   # Hangul → Korean
    (0x0400, 0x04FF): ('ru', 'RU'),   # Cyrillic → Russian
}
# Han ideographs are shared by Chinese and Japanese, so a Han hit alone is
# only a provisional "Chinese"; kana anywhere later settles it as Japanese.
_HAN_RANGE = (0x4E00, 0x9FFF)
_KANA_RANGE = (0x3040, 0x30FF)


def _detect_locale(query: str) -> tuple[str, str]:
    """
    Detect (lang, country) from the Unicode script used in *query*.

    The first character that falls in a known script range decides the
    locale, with one exception: if that character is a Han ideograph the
    rest of the query is scanned for Hiragana/Katakana, and the result is
    Japanese when any is found, Chinese otherwise.

    Returns ('en', 'US') when no known script is present.
    """
    han_seen = False
    for c in query:
        cp = ord(c)
        if han_seen:
            if _KANA_RANGE[0] <= cp <= _KANA_RANGE[1]:
                return ('ja', 'JP')
            continue
        for (lo, hi), locale in _LOCALE_MAP.items():
            if lo <= cp <= hi:
                if (lo, hi) == _HAN_RANGE:
                    han_seen = True
                    break
                return locale
    return ('zh', 'CN') if han_seen else ('en', 'US')  # default to English
def _fetch_google_rss(url: str, num_results: int) -> list[dict]:
    """
    Fetch an RSS feed from *url* and return its first *num_results* items.

    Each item becomes a dict with "title", "href" (the item's <link>) and
    "body" (the <description> with markup tags stripped). Missing or empty
    elements yield empty strings rather than None.

    Network and XML parsing errors propagate to the caller.
    """
    print(f"  🌐 GOOGLE NEWS URL: {url}")
    req = urllib.request.Request(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)'}
    )
    with urllib.request.urlopen(req, timeout=10) as response:
        xml_data = response.read()
    root = ET.fromstring(xml_data)

    def _text_of(item, tag: str) -> str:
        # An element that is absent *or* present-but-empty (text is None)
        # both map to "", so the regex below never receives None.
        node = item.find(tag)
        return (node.text or "") if node is not None else ""

    results = []
    for item in root.findall('.//item')[:num_results]:
        snippet = re.sub('<[^<]+>', '', _text_of(item, 'description'))
        results.append({
            "title": _text_of(item, 'title'),
            "href": _text_of(item, 'link'),
            "body": snippet,
        })
    print(f"  📰 Results found: {len(results)}")
    return results
def _google_news_search(query: str, num_results: int = 8) -> list[dict]:
    """
    Multilingual Google News RSS search.

    1. Detect locale from query script (Hindi→hi/IN, Bengali→bn/IN, etc.)
    2. Search with detected locale
    3. Fallback: search with no locale (Google auto-detects)
    4. Fallback: slice to first 6 words and retry (only when the query is
       longer than 6 words, otherwise the retry would repeat step 2)

    Returns the result dicts of the first attempt that finds anything, or
    an empty list when every attempt comes back empty or any request fails.
    """
    try:
        safe_query = urllib.parse.quote(query)
        lang, country = _detect_locale(query)
        print(f"\n{'='*50}")
        print(f"🔍 GOOGLE NEWS SEARCH")
        print(f"  Query: {query[:80]}{'...' if len(query) > 80 else ''}")
        print(f"  Detected locale: hl={lang}, gl={country}")

        # Attempt 1: Search with detected locale
        url = f"https://news.google.com/rss/search?q={safe_query}&hl={lang}&gl={country}&ceid={country}:{lang}"
        results = _fetch_google_rss(url, num_results)

        # Attempt 2: No locale params → let Google infer
        if not results:
            print("  ⚠️ Zero results. Retrying with no locale params...")
            url_nolang = f"https://news.google.com/rss/search?q={safe_query}"
            results = _fetch_google_rss(url_nolang, num_results)

        # Attempt 3: Query slicing → first 6 words only
        if not results:
            words = query.split()
            # Slicing only changes the query when it has more than 6 words.
            if len(words) > 6:
                short_query = " ".join(words[:6])
                safe_short = urllib.parse.quote(short_query)
                print(f"  ⚠️ Still zero. Slicing to 6 words: '{short_query}'")
                url_short = f"https://news.google.com/rss/search?q={safe_short}&hl={lang}&gl={country}&ceid={country}:{lang}"
                results = _fetch_google_rss(url_short, num_results)

        print(f"  ✅ Final result count: {len(results)}")
        print(f"{'='*50}\n")
        return results
    except Exception as exc:
        # Best-effort by design: a failed search is treated as "no evidence".
        logger.error("Google News search failed: %s", exc)
        return []
+def _wikipedia_search(query: str) -> list[dict]:
+    """
+    Multilingual Wikipedia search.
+    Tries English first, then falls back to the language-specific edition
+    if the query contains non-ASCII characters.
+    """
+    def _wiki_query(wiki_lang: str, q: str) -> list[dict]:
+        safe_query = urllib.parse.quote(q)
+        url = f"https://{wiki_lang}.wikipedia.org/w/api.php?action=query&list=search&srsearch={safe_query}&utf8=&format=json"
+        print(f"  📚 WIKIPEDIA URL ({wiki_lang}): {url[:120]}...")
+        req = urllib.request.Request(
+            url,
+            headers={'User-Agent': 'VeriLensAI/1.0 (University Fact-Checking Project)'}
+        )
+        with urllib.request.urlopen(req, timeout=10) as response:
+            data = json.loads(response.read().decode())
+        results = []
+        for item in data.get('query', {}).get('search', [])[:2]:
+            title = item['title']
+            clean_snippet = re.sub('<[^<]+>', '', item['snippet'])
+            results.append({
+                "title": f"{title} - Wikipedia",
+                "href": f"https://{wiki_lang}.wikipedia.org/wiki/{urllib.parse.quote(title.replace(' ', '_'))}",
+                "body": clean_snippet
+            })
+        print(f"  📚 Wikipedia ({wiki_lang}) results: {len(results)}")
+        return results
+    try:
+        # 1. Try English Wikipedia first
+        results = _wiki_query('en', query)
+        # 2. If 0 results and query contains non-ASCII, detect language Wikipedia
+        if not results and any(ord(c) > 127 for c in query):
+            detected_lang, _ = _detect_locale(query)
+            if detected_lang != 'en':
+                logger.info(f"Retrying Wikipedia with lang={detected_lang} for non-ASCII query")
+                results = _wiki_query(detected_lang, query)
+        return results
+    except Exception as exc:
+        logger.error("Wikipedia search failed: %s", exc)
+        return []
+async def _search_web(query: str, num_results: int = 8) -> list[dict]:
+    """Search the web for news AND historical facts concurrently, with short-query fallback."""
+    # Run Google News and Wikipedia at the exact same time
+    news_task = asyncio.to_thread(_google_news_search, query, num_results)
+    wiki_task = asyncio.to_thread(_wikipedia_search, query)
+    # Wait for both to finish
+    news_results, wiki_results = await asyncio.gather(news_task, wiki_task)
+    # Allocate half the quota to each source to ensure balanced verification
+    half_quota = num_results // 2
+    balanced_results = news_results[:half_quota] + wiki_results[:num_results - half_quota]
+    # If Wiki returned fewer results than its quota, fill the gap with more news
+    if len(balanced_results) < num_results:
+        remaining_slots = num_results - len(balanced_results)
+        balanced_results.extend(news_results[half_quota:half_quota + remaining_slots])
+    # 🔄 SHORT-QUERY FALLBACK: If 0 results, retry with just the first 6 words
+    if not balanced_results:
+        words = query.split()
+        if len(words) > 4:
+            short_query = " ".join(words[:6])
+            logger.info(f"Zero results for full query. Retrying with short query: '{short_query}'")
+            news_task2 = asyncio.to_thread(_google_news_search, short_query, num_results)
+            wiki_task2 = asyncio.to_thread(_wikipedia_search, short_query)
+            news2, wiki2 = await asyncio.gather(news_task2, wiki_task2)
+            balanced_results = news2[:half_quota] + wiki2[:num_results - half_quota]
+            if len(balanced_results) < num_results:
+                remaining_slots = num_results - len(balanced_results)
+                balanced_results.extend(news2[half_quota:half_quota + remaining_slots])
+    return balanced_results
+# Minimum NLI entailment probability a source must reach to count as relevant;
+# verify_claim applies it to every source that does not get the Wikipedia bar.
+MIN_RELEVANCE_THRESHOLD = 0.75
+# Label order for MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7:
+# index 0 = entailment, index 1 = neutral, index 2 = contradiction.
+_NLI_ENTAILMENT_IDX = 0
+def _softmax(logits: np.ndarray) -> np.ndarray:
+    """Numerically-stable softmax over the last axis."""
+    exp = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
+    return exp / np.sum(exp, axis=-1, keepdims=True)
+def _compute_per_source_similarity(text: str, snippets: list[str]) -> list[float]:
+    """
+    Compute strict semantic entailment using an NLI Cross-Encoder.
+    The model outputs raw logits for [Contradiction, Entailment, Neutral].
+    We apply softmax and return the Entailment probability (0.0 → 1.0)
+    so that keyword-overlap alone can no longer fool the system.
+    """
+    model = _get_cross_model()
+    if model is None or not snippets:
+        return [0.0] * len(snippets)
+    try:
+        # Cross-Encoders take PAIRS: (premise=article, hypothesis=claim)
+        pairs = [[snippet[:512], text[:512]] for snippet in snippets]
+        # NLI models return raw logits of shape (N, 3)
+        logits = model.predict(pairs)
+        logits = np.array(logits)
+        # Ensure 2-D even for a single pair
+        if logits.ndim == 1:
+            logits = logits.reshape(1, -1)
+        # Softmax → probabilities, then grab the Entailment column
+        probs = _softmax(logits)
+        entailment_scores = probs[:, _NLI_ENTAILMENT_IDX]
+        return [float(s) for s in entailment_scores]
+    except Exception as exc:
+        logger.error("NLI entailment computation failed: %s", exc)
+        return [0.0] * len(snippets)
+async def verify_claim(text: str, search_query: str) -> VerificationResult:
+    """
+    Search the internet for articles related to *search_query*,
+    compute per-source semantic entailment, and discard irrelevant results.
+    """
+    items = await _search_web(search_query)
+    if not items:
+        return VerificationResult(similarity_score=0.0, sources=[], verified=False)
+    # Build candidate lists
+    candidates: list[SourceArticle] = []
+    snippets: list[str] = []
+    # 🔥 THE FIX: Removed the [:8] slice so Wikipedia actually gets processed!
+    for item in items:
+        title = item.get("title", "")
+        link = item.get("url", "") or item.get("href", "")
+        snippet = item.get("body", "")
+        candidates.append(
+            SourceArticle(
+                title=title,
+                url=link,
+                snippet=snippet,
+                trust=_trust_level(url=link, snippet=snippet, title=title),
+            )
+        )
+        snippets.append(f"{title}. {snippet}")
+    # Compute per-source similarity scores using the new Cross-Encoder
+    scores = await asyncio.to_thread(_compute_per_source_similarity, text, snippets)
+    # Filter: only keep sources above the relevance threshold
+    sources: list[SourceArticle] = []
+    relevant_scores: list[float] = []
+    # 🔎 X-RAY VISION: Print the AI's exact math to the backend terminal
+    print("\n" + "="*50)
+    print("🧠 CROSS-ENCODER SCORES:")
+    for candidate, score in zip(candidates, scores):
+        print(f"Score: {score:.3f} | Source: {candidate.url}")
+        # 🏛️ THE WIKIPEDIA VIP PASS 🏛️
+        if "wikipedia.org" in candidate.url:
+            required_score = 0.45  # Lower bar for encyclopedic context, but high enough to reject noise
+        else:
+            required_score = MIN_RELEVANCE_THRESHOLD  # 0.75 strict NLI entailment for news
+        if score >= required_score:
+            sources.append(candidate)
+            relevant_scores.append(score)
+            print(f"  -> ✅ ACCEPTED (Requires >= {required_score})")
+        else:
+            print(f"  -> ❌ REJECTED (Requires >= {required_score})")
+    print("="*50 + "\n")
+    if not sources:
+        return VerificationResult(similarity_score=0.0, sources=[], verified=True)
+    avg_similarity = sum(relevant_scores) / len(relevant_scores)
+    return VerificationResult(
+        similarity_score=round(avg_similarity, 4),
+        sources=sources,
+        verified=True,
+    )