Add CORS handling for backend wiring; fix input validation issues
Files changed:
- .gitignore +3 -1
- extractor.py +42 -1
- main.py +110 -14
- tests/test_tier2_nli.py +242 -0
- tests/test_tier3_llm.py +316 -0
- tests/test_verdict_router.py +327 -0
- verifier/__init__.py +45 -0
- verifier/tier2_nli.py +5 -2
.gitignore CHANGED

@@ -1,3 +1,5 @@
 .env
 __pycache__/
-*.pyc
+*.pyc
+.pytest_cache/
+venv/
extractor.py CHANGED

@@ -17,7 +17,45 @@ Example:
 """
 
 import re
-
+import html
+import unicodedata
+from metrics import find_metric
+
+
+def preprocess_claim(text: str) -> str:
+    """
+    Sanitize raw user input before extraction.
+    Handles HTML tags, whitespace noise, zero-width Unicode chars, and encoding.
+
+    Steps:
+    1. HTML-unescape — "&amp;" → "&", "&lt;" → "<"
+    2. Strip HTML tags — <b>foo</b> → foo
+    3. Remove zero-width / BOM chars (\u200b, \u200c, \u200d, \ufeff)
+    4. NFC normalization — unify composed/decomposed Unicode forms
+    5. Collapse whitespace — tabs, newlines, multiple spaces → single space
+    6. Strip leading/trailing whitespace
+    """
+    # Step 1: HTML unescape (&amp; &lt; &gt; etc.)
+    text = html.unescape(text)
+
+    # Step 2: Strip HTML tags
+    text = re.sub(r"<[^>]+>", " ", text)
+
+    # Step 3: Remove zero-width and BOM characters
+    text = re.sub(r"[\u200b\u200c\u200d\ufeff]", "", text)
+
+    # Step 4: Unicode NFC normalization (e.g. é as one codepoint, not e + combining accent)
+    text = unicodedata.normalize("NFC", text)
+
+    # Step 5 & 6: Collapse whitespace and strip
+    text = re.sub(r"[\t\r\n]+", " ", text)  # newlines/tabs → space
+    text = re.sub(r" {2,}", " ", text)      # multiple spaces → one
+    text = text.strip()
+
+    return text
+
+
+
 
 # extract_year(text) — Find the year in a claim
 def extract_year(text):

@@ -120,6 +158,9 @@ def _clean_number(raw):
 
 def extract_all(text):
 
+    # Sanitize input before anything else runs
+    text = preprocess_claim(text)
+
     # ---- STEP 1: Extract each field independently ----
     metric_result = find_metric(text)  # Returns {"metric": ..., "confidence": ...}
     value = extract_value(text)        # Returns float or None
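As a quick check of the sanitizer added above (a sketch that assumes extractor is importable from the project root; the expected output follows mechanically from the regexes in the diff):

from extractor import preprocess_claim

raw = "GDP &amp; GNP <i>grew</i>\t7.5%\u200b in 2024"
print(preprocess_claim(raw))  # GDP & GNP grew 7.5% in 2024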
main.py CHANGED

@@ -15,16 +15,28 @@ To run:
 
 Swagger docs: http://localhost:5001/docs
 """
+import asyncio
+import logging
+import os
 
 from fastapi import FastAPI, HTTPException
-from fastapi.responses import HTMLResponse
+from fastapi.exceptions import RequestValidationError
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import HTMLResponse, JSONResponse
 from pydantic import BaseModel, Field
-
-from metrics import get_all_metric_names
+
 from claim_detector import split_into_sentences, score_claim_probability
+from extractor import extract_all, preprocess_claim
+from metrics import get_all_metric_names
 from swagger_ui import get_swagger_html, tags_metadata
 from verifier.tier1_numeric import tier1_numeric_check
 from verifier.verdict_router import route_verification, VerificationResult
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(name)s %(levelname)s %(message)s",
+)
+logger = logging.getLogger("bware.nlp")
 # =============================================================================
 # PYDANTIC MODELS — Request/Response contracts
 # =============================================================================

@@ -62,17 +74,25 @@ class ExtractionResponse(BaseModel):
 
 
 class HealthResponse(BaseModel):
-    """Health check response."""
-    status: str
+    """Health check response — includes component readiness."""
+    status: str          # "healthy" | "degraded"
     service: str
     version: str
+    bart_model: str      # "loaded" | "not_loaded"
+    gemini_key: str      # "configured" | "missing"
+    newsapi_key: str     # "configured" | "missing"
+    factcheck_key: str   # "configured" | "missing"
 
     model_config = {
         "json_schema_extra": {
             "example": {
                 "status": "healthy",
                 "service": "B-ware NLP Service",
-                "version": "1.0.0"
+                "version": "1.0.0",
+                "bart_model": "loaded",
+                "gemini_key": "configured",
+                "newsapi_key": "missing",
+                "factcheck_key": "configured"
             }
         }
     }

@@ -376,8 +396,22 @@ curl -X POST http://localhost:5001/analyze \\
     docs_url=None,  # we override /docs below with custom settings
 )
 
+# Configure CORS
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["http://localhost:3000",
+                   "http://localhost:5000"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"]
+)
 
+@app.exception_handler(Exception)
+async def generic_exception_handler(request, exc):
+    """Catch-all exception handler to prevent 500 errors from crashing the server."""
+    return JSONResponse(status_code=500,
+                        content={"error": "Internal server error",
+                                 "detail": str(exc)})
-
 
 # ENDPOINTS
 

@@ -397,11 +431,33 @@ async def custom_swagger_ui():
     response_description="Service status and version info"
 )
 def health_check():
-    """Check if the NLP service is running."""
+    """
+    Check if the NLP service is running and all components are ready.
+    Returns component-level status so the Node backend can make informed decisions.
+
+    - `bart_model: loaded` — BART-MNLI is warm in memory (first /verify/deep call triggers load)
+    - `*_key: configured` — the env var is set (non-empty); does not validate the key
+    - `status: degraded` — at least one key is missing (Tier 2/3 may fail)
+    """
+    from verifier.tier2_nli import _load_pipeline  # local import to avoid circular
+
+    bart_status = "loaded" if _load_pipeline.cache_info().currsize > 0 else "not_loaded"
+    gemini_key = "configured" if os.getenv("GEMINI_API_KEY") else "missing"
+    newsapi_key = "configured" if os.getenv("NEWS_API_KEY") else "missing"
+    factcheck = "configured" if os.getenv("GOOGLE_FACT_CHECK_API_KEY") else "missing"
+
+    # Degrade if any external API key is missing (Tier 2/3 will silently skip them)
+    keys_ok = all(k == "configured" for k in [gemini_key, newsapi_key, factcheck])
+    overall = "healthy" if keys_ok else "degraded"
+
     return {
-        "status": "healthy",
+        "status": overall,
         "service": "B-ware NLP Service",
-        "version": "1.0.0"
+        "version": "1.0.0",
+        "bart_model": bart_status,
+        "gemini_key": gemini_key,
+        "newsapi_key": newsapi_key,
+        "factcheck_key": factcheck,
     }
 
 
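As a quick smoke test of the wiring above, the health endpoint can be queried from Python (a minimal sketch, assuming the service is running locally on port 5001; the commented values show one possible response for an environment with a missing key):

import requests

data = requests.get("http://localhost:5001/health", timeout=5).json()
print(data["status"])      # e.g. "degraded" if any API key is missing
print(data["bart_model"])  # "not_loaded" until the first NLI call warms the model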
@@ -681,7 +737,27 @@ async def verify_full(request: ClaimRequest):
     Returns the `tier_used` field so you know which layer produced the verdict.
     Use `POST /verify/deep` to force all three tiers regardless of early exit conditions.
     """
-    result: VerificationResult = await route_verification(request.text, force_tier3=False)
+    clean_text = preprocess_claim(request.text)
+    try:
+        result: VerificationResult = await asyncio.wait_for(
+            route_verification(clean_text, force_tier3=False),
+            timeout=30.0,
+        )
+    except asyncio.TimeoutError:
+        logger.warning("verify_full timed out for text: %.80s", clean_text)
+        return FullVerificationResult(
+            original_text=clean_text,
+            tier_used="tier1",
+            verdict="unverifiable",
+            confidence=0.0,
+            extracted_metric=None,
+            extracted_value=None,
+            extracted_year=None,
+            extraction_confidence=0.0,
+            evidence=[],
+            explanation="Verification timed out after 30 seconds.",
+            tiers_run=[],
+        )
     return FullVerificationResult(
         original_text=result.original_text,
         tier_used=result.tier_used,

@@ -732,7 +808,27 @@ async def verify_deep(request: ClaimRequest):
     **Slower** than `/verify` — expect ~3–8 seconds latency (network + LLM).
     Subject to Gemini free-tier rate limits (15 req/min).
     """
-    result: VerificationResult = await route_verification(request.text, force_tier3=True)
+    clean_text = preprocess_claim(request.text)
+    try:
+        result: VerificationResult = await asyncio.wait_for(
+            route_verification(clean_text, force_tier3=True),
+            timeout=30.0,
+        )
+    except asyncio.TimeoutError:
+        logger.warning("verify_deep timed out for text: %.80s", clean_text)
+        return FullVerificationResult(
+            original_text=clean_text,
+            tier_used="tier1",
+            verdict="unverifiable",
+            confidence=0.0,
+            extracted_metric=None,
+            extracted_value=None,
+            extracted_year=None,
+            extraction_confidence=0.0,
+            evidence=[],
+            explanation="Verification timed out after 30 seconds.",
+            tiers_run=[],
+        )
     return FullVerificationResult(
         original_text=result.original_text,
         tier_used=result.tier_used,

@@ -766,8 +862,8 @@
 # Run the server directly: python main.py
 if __name__ == "__main__":
     import uvicorn
-
-
+    logger.info("Starting B-ware NLP Service...")
+    logger.info("API docs available at: http://localhost:5001/docs")
     uvicorn.run(
         "main:app",
         host="0.0.0.0",
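Both endpoints guard route_verification with asyncio.wait_for, which cancels the pending tier chain and raises TimeoutError once the deadline passes. A standalone sketch of that pattern (hypothetical names, not project code):

import asyncio

async def slow_tier_chain():
    await asyncio.sleep(60)  # stand-in for a verification that never finishes
    return "accurate"

async def main():
    try:
        return await asyncio.wait_for(slow_tier_chain(), timeout=1.0)
    except asyncio.TimeoutError:
        return "unverifiable"  # same fallback shape the endpoints above construct

print(asyncio.run(main()))  # unverifiable (after about 1 second)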
tests/test_tier2_nli.py ADDED
@@ -0,0 +1,242 @@

"""
test_tier2_nli.py — Tests for Tier 2: NLI evidence scoring
===========================================================
Run with: pytest tests/test_tier2_nli.py -v

WHAT WE'RE TESTING:
- Label mapping (_map_label): raw HuggingFace labels → our NLI vocabulary
- Empty/short input handling: graceful degradation when no evidence
- Aggregation logic: majority voting across multiple snippets
- Confidence averaging: math correctness

WHY WE MOCK:
The real NLI pipeline downloads a 1.6GB model from HuggingFace.
In tests, we replace _run_nli_sync with a fake that returns
predictable results instantly. This way:
- Tests run in <1 second (not 30+ seconds for model download)
- Tests work offline (no internet needed)
- Tests are deterministic (same input → always same output)

HOW MOCKING WORKS:
    @patch("verifier.tier2_nli._run_nli_sync")
    def test_something(self, mock_nli):
        mock_nli.return_value = {"labels": [...], "scores": [...]}

This says: "Wherever _run_nli_sync is called inside tier2_nli.py,
don't actually call it — use this fake return value instead."

The mock object is passed as the LAST parameter to the test function
(after self). If you have multiple @patch decorators, they're passed
in REVERSE order (bottom decorator → first parameter).
"""

import sys
import os
import asyncio

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from unittest.mock import patch, MagicMock
from verifier.tier2_nli import run_nli, _map_label, Tier2Result, NliResult
from verifier.evidence_fetcher import EvidenceSnippet


# =============================================================================
# HELPER: Create fake evidence snippets for testing
# =============================================================================

def _make_snippet(text: str, source: str = "TestSource") -> EvidenceSnippet:
    """
    Factory function to create EvidenceSnippet objects for tests.

    WHY A HELPER?
    EvidenceSnippet has 6 fields. Writing them out every time is tedious
    and makes tests harder to read. This helper provides sensible defaults
    so each test only specifies what matters (the text content).
    """
    return EvidenceSnippet(
        source=source,
        title=f"Article about {text[:30]}",
        snippet=text,
        url="https://example.com/article",
        published_date="2024-01-15",
        evidence_type="news",
    )


# =============================================================================
# LABEL MAPPING TESTS
# =============================================================================

class TestMapLabel:
    """
    _map_label converts HuggingFace's zero-shot labels to our NLI vocabulary.

    The pipeline returns labels like "supports the claim" but we need
    standard NLI terms: entailment, contradiction, neutral.
    """

    def test_supports_maps_to_entailment(self):
        """'supports the claim' → 'entailment'"""
        assert _map_label("supports the claim") == "entailment"

    def test_contradicts_maps_to_contradiction(self):
        """'contradicts the claim' → 'contradiction'"""
        assert _map_label("contradicts the claim") == "contradiction"

    def test_unrelated_maps_to_neutral(self):
        """'unrelated to the claim' → 'neutral'"""
        assert _map_label("unrelated to the claim") == "neutral"

    def test_unknown_label_maps_to_neutral(self):
        """Any unrecognized label defaults to 'neutral' (safe fallback)."""
        assert _map_label("something unexpected") == "neutral"

    def test_case_insensitive(self):
        """Label matching should be case-insensitive."""
        assert _map_label("SUPPORTS the claim") == "entailment"
        assert _map_label("Contradicts The Claim") == "contradiction"


# =============================================================================
# EMPTY / SHORT INPUT TESTS
# =============================================================================

class TestRunNliEdgeCases:
    """
    Tests for run_nli when input is empty or too short.
    No mocking needed — these paths never reach the NLI model.
    """

    def test_empty_snippets_returns_insufficient_evidence(self):
        """
        No evidence at all → verdict should be 'insufficient_evidence'.

        WHY:
        If the evidence fetcher found 0 results (API down, no matches),
        we can't make any NLI judgment. 'insufficient_evidence' tells
        the verdict router to escalate to Tier 3.
        """
        result = asyncio.run(run_nli(claim="GDP grew 7.5%", snippets=[]))

        assert result.verdict == "insufficient_evidence"
        assert result.confidence == 0.0
        assert result.nli_results == []
        assert result.evidence_count == 0

    def test_snippets_too_short_are_skipped(self):
        """
        Snippets shorter than 10 characters are skipped.

        WHY:
        Tiny snippets like "N/A" or "..." would produce garbage NLI scores.
        The 10-char minimum filters them out. If ALL snippets are too short,
        we get insufficient_evidence (same as empty).
        """
        short_snippets = [
            _make_snippet("short"),  # 5 chars — skipped
            _make_snippet("tiny"),   # 4 chars — skipped
            _make_snippet(""),       # 0 chars — skipped
        ]
        result = asyncio.run(run_nli(
            claim="GDP grew 7.5%",
            snippets=short_snippets
        ))

        assert result.verdict == "insufficient_evidence"
        assert result.evidence_count == 0


# =============================================================================
# AGGREGATION TESTS (with mocked NLI model)
# =============================================================================

class TestNliAggregation:
    """
    Tests for the majority voting + confidence averaging logic.

    HOW MAJORITY VOTING WORKS (from tier2_nli.py):
    1. Each snippet gets an NLI label (entailment/contradiction/neutral)
    2. We count how many snippets got each label
    3. The label with the most votes wins
    4. If tied, the label with the highest total score wins
    5. Confidence = average score of the winning label's snippets

    We mock _run_nli_sync to control exactly what the model "returns".
    """

    @patch("verifier.tier2_nli._run_nli_sync")
    def test_entailment_wins_majority(self, mock_nli):
        """
        3 out of 5 snippets support the claim → verdict = entailment.

        WHAT THE MOCK DOES:
        mock_nli.side_effect = [...] means "return these values in order".
        First call returns entailment, second returns entailment, etc.
        """
        # Simulate 5 NLI calls: 3 support, 1 contradicts, 1 neutral
        mock_nli.side_effect = [
            {"labels": ["supports the claim", "contradicts the claim", "unrelated to the claim"],
             "scores": [0.85, 0.10, 0.05]},
            {"labels": ["supports the claim", "unrelated to the claim", "contradicts the claim"],
             "scores": [0.78, 0.15, 0.07]},
            {"labels": ["supports the claim", "contradicts the claim", "unrelated to the claim"],
             "scores": [0.92, 0.05, 0.03]},
            {"labels": ["contradicts the claim", "supports the claim", "unrelated to the claim"],
             "scores": [0.70, 0.20, 0.10]},
            {"labels": ["unrelated to the claim", "supports the claim", "contradicts the claim"],
             "scores": [0.60, 0.25, 0.15]},
        ]

        snippets = [_make_snippet(f"Evidence snippet number {i} about GDP growth rates in India")
                    for i in range(5)]

        result = asyncio.run(run_nli(claim="India's GDP grew 7.5% in 2024", snippets=snippets))

        assert result.verdict == "entailment"
        assert result.evidence_count == 5
        # Confidence should be average of the 3 entailment scores: (0.85+0.78+0.92)/3
        expected_conf = round((0.85 + 0.78 + 0.92) / 3, 4)
        assert result.confidence == expected_conf

    @patch("verifier.tier2_nli._run_nli_sync")
    def test_contradiction_wins_majority(self, mock_nli):
        """
        Majority of snippets contradict the claim → verdict = contradiction.
        """
        mock_nli.side_effect = [
            {"labels": ["contradicts the claim", "supports the claim", "unrelated to the claim"],
             "scores": [0.88, 0.08, 0.04]},
            {"labels": ["contradicts the claim", "unrelated to the claim", "supports the claim"],
             "scores": [0.75, 0.15, 0.10]},
            {"labels": ["supports the claim", "contradicts the claim", "unrelated to the claim"],
             "scores": [0.65, 0.20, 0.15]},
        ]

        snippets = [_make_snippet(f"Contradicting evidence {i} about inflation rates")
                    for i in range(3)]

        result = asyncio.run(run_nli(claim="Inflation was 4%", snippets=snippets))

        assert result.verdict == "contradiction"
        assert result.evidence_count == 3

    @patch("verifier.tier2_nli._run_nli_sync")
    def test_single_snippet(self, mock_nli):
        """
        Only 1 snippet → that snippet's label becomes the verdict.
        No voting needed; confidence = that snippet's score.
        """
        mock_nli.return_value = {
            "labels": ["supports the claim", "contradicts the claim", "unrelated to the claim"],
            "scores": [0.91, 0.06, 0.03],
        }

        snippets = [_make_snippet("India's GDP growth exceeded expectations reaching 7.4 percent")]
        result = asyncio.run(run_nli(claim="GDP was 7.5%", snippets=snippets))

        assert result.verdict == "entailment"
        assert result.confidence == 0.91
        assert result.evidence_count == 1
        assert len(result.nli_results) == 1
        assert result.nli_results[0].label == "entailment"
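The voting-plus-averaging rule these tests pin down fits in a few lines; below is a standalone illustration of steps 1 to 5 from the class docstring (not the tier2_nli.py implementation, and the tie-break step is omitted):

from collections import Counter

# One (label, score) pair per snippet, mirroring test_entailment_wins_majority
labeled = [("entailment", 0.85), ("entailment", 0.78), ("entailment", 0.92),
           ("contradiction", 0.70), ("neutral", 0.60)]

winner, _ = Counter(label for label, _ in labeled).most_common(1)[0]
scores = [s for label, s in labeled if label == winner]
confidence = round(sum(scores) / len(scores), 4)
print(winner, confidence)  # entailment 0.85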
tests/test_tier3_llm.py ADDED
@@ -0,0 +1,316 @@

"""
test_tier3_llm.py — Tests for Tier 3: LLM reasoning via Gemini
===============================================================
Run with: pytest tests/test_tier3_llm.py -v

WHAT WE'RE TESTING:
- Prompt building: correct structure for all data combinations
- Response parsing: valid JSON, markdown-wrapped JSON, garbage input
- Graceful degradation: missing API key, API errors, invalid verdicts
- End-to-end tier3_llm_check with mocked Gemini responses

WHY WE TEST PARSING SO HEAVILY:
LLMs are unpredictable. Gemini might return:
- Clean JSON: {"verdict": "accurate", ...}
- Markdown-wrapped: ```json\n{"verdict": "accurate", ...}\n```
- With extra text: "Here's my analysis:\n{...}"
- Complete garbage: "I cannot determine..."
Our parser must handle ALL of these. Each test covers one scenario.
"""

import sys
import os
import asyncio

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from unittest.mock import patch, AsyncMock
from verifier.tier3_llm import (
    _build_prompt,
    _parse_llm_response,
    tier3_llm_check,
    EvidenceSummary,
    Tier3Result,
)


# =============================================================================
# PROMPT BUILDING TESTS
# =============================================================================

class TestBuildPrompt:
    """
    _build_prompt constructs the text sent to Gemini.
    It must include ALL available context and handle missing data gracefully.
    """

    def test_full_numeric_data(self):
        """
        When we have official values from Tier 1, the prompt should include
        both the claimed and official numbers plus the percentage error.
        """
        prompt = _build_prompt(
            claim="India's GDP grew 7.5% in 2024",
            metric="GDP growth rate",
            claimed_value=7.5,
            year=2024,
            official_value=6.49,
            percentage_error=15.56,
            official_source="World Bank",
            evidence_snippets=[],
        )

        # Check that key numeric data appears in the prompt
        assert "7.5" in prompt        # claimed value
        assert "6.49" in prompt       # official value
        assert "15.56" in prompt      # percentage error
        assert "World Bank" in prompt # source
        assert "2024" in prompt       # year
        assert "GDP growth rate" in prompt

    def test_no_numeric_data(self):
        """
        When extraction failed (no metric/value), the prompt should say so
        rather than crash or show 'None'.
        """
        prompt = _build_prompt(
            claim="The economy is doing great",
            metric=None,
            claimed_value=None,
            year=None,
            official_value=None,
            percentage_error=None,
            official_source=None,
            evidence_snippets=[],
        )

        assert "No numeric data" in prompt
        assert "None" not in prompt  # Should not leak Python's None into the prompt

    def test_metric_but_no_official_value(self):
        """
        Metric extracted but World Bank has no data.
        Common for metrics like 'fiscal deficit' where data lags.
        """
        prompt = _build_prompt(
            claim="Fiscal deficit was 5.9% in 2025",
            metric="fiscal deficit",
            claimed_value=5.9,
            year=2025,
            official_value=None,
            percentage_error=None,
            official_source=None,
            evidence_snippets=[],
        )

        assert "No official numeric data available" in prompt
        assert "5.9" in prompt
        assert "2025" in prompt

    def test_evidence_snippets_included(self):
        """
        When evidence snippets exist, they should appear numbered in the prompt.
        """
        snippets = [
            EvidenceSummary(
                source="Reuters", snippet="India GDP grew 6.5% in 2024",
                url="https://reuters.com/article", evidence_type="news"
            ),
            EvidenceSummary(
                source="AFP Fact Check", snippet="Claim of 7.5% GDP is misleading",
                url="https://factcheck.afp.com/123", evidence_type="fact_check"
            ),
        ]

        prompt = _build_prompt(
            claim="GDP grew 7.5%", metric="GDP growth rate",
            claimed_value=7.5, year=2024,
            official_value=6.49, percentage_error=15.56,
            official_source="World Bank",
            evidence_snippets=snippets,
        )

        assert "Reuters" in prompt
        assert "AFP Fact Check" in prompt
        assert "[1]" in prompt         # Numbered evidence
        assert "[2]" in prompt
        assert "NEWS" in prompt        # evidence_type shown in uppercase
        assert "FACT_CHECK" in prompt

    def test_prompt_has_verdict_definitions(self):
        """
        The prompt must always include verdict definitions so the LLM
        knows our exact thresholds and vocabulary.
        """
        prompt = _build_prompt(
            claim="test", metric=None, claimed_value=None, year=None,
            official_value=None, percentage_error=None,
            official_source=None, evidence_snippets=[],
        )

        assert "accurate" in prompt
        assert "misleading" in prompt
        assert "false" in prompt
        assert "unverifiable" in prompt
        assert "RESPOND WITH ONLY VALID JSON" in prompt


# =============================================================================
# RESPONSE PARSING TESTS
# =============================================================================

class TestParseResponse:
    """
    _parse_llm_response must extract valid JSON from messy LLM output.

    WHY THIS IS CRITICAL:
    If parsing fails, tier3_llm_check returns "unverifiable" — which means
    we wasted an API call and the user gets no useful answer. Every edge
    case we handle here = fewer false "unverifiable" results.
    """

    def test_clean_json(self):
        """Perfect JSON — the happy path."""
        raw = '{"verdict": "accurate", "confidence": 0.92, "explanation": "Data matches.", "sources_used": ["World Bank"]}'
        result = _parse_llm_response(raw)

        assert result is not None
        assert result["verdict"] == "accurate"
        assert result["confidence"] == 0.92
        assert result["explanation"] == "Data matches."
        assert result["sources_used"] == ["World Bank"]

    def test_markdown_wrapped_json(self):
        """
        LLMs often wrap JSON in ```json ... ``` markdown blocks.
        Our parser strips these wrappers.
        """
        raw = '```json\n{"verdict": "misleading", "confidence": 0.71, "explanation": "Error is 15%.", "sources_used": []}\n```'
        result = _parse_llm_response(raw)

        assert result is not None
        assert result["verdict"] == "misleading"

    def test_json_with_extra_text(self):
        """
        LLM adds conversational text before/after the JSON.
        Our regex extracts the {...} block.
        """
        raw = 'Here is my analysis:\n\n{"verdict": "false", "confidence": 0.88, "explanation": "Clearly wrong.", "sources_used": ["Reuters"]}\n\nHope this helps!'
        result = _parse_llm_response(raw)

        assert result is not None
        assert result["verdict"] == "false"

    def test_garbage_input_returns_none(self):
        """Completely unparseable text → None."""
        assert _parse_llm_response("I cannot determine the accuracy.") is None
        assert _parse_llm_response("") is None
        assert _parse_llm_response(None) is None

    def test_invalid_json_returns_none(self):
        """Malformed JSON (missing quotes, trailing commas) → None."""
        raw = '{verdict: accurate, confidence: 0.9}'  # missing quotes
        assert _parse_llm_response(raw) is None


# =============================================================================
# END-TO-END TIER 3 TESTS (with mocked Gemini API)
# =============================================================================

class TestTier3LlmCheck:
    """
    Tests for the full tier3_llm_check function.

    MOCKING STRATEGY:
    We mock _call_gemini (the HTTP call to Gemini) not the whole function.
    This way we still test:
    - Prompt building
    - Response parsing
    - Verdict normalization
    - Confidence clamping
    Only the actual HTTP call is faked.

    AsyncMock vs MagicMock:
    _call_gemini is an async function (async def _call_gemini).
    Regular MagicMock doesn't work with `await`. AsyncMock does.
    """

    @patch("verifier.tier3_llm._call_gemini", new_callable=AsyncMock)
    def test_successful_analysis(self, mock_gemini):
        """Happy path: Gemini returns valid JSON."""
        mock_gemini.return_value = '{"verdict": "accurate", "confidence": 0.95, "explanation": "The claimed GDP growth of 7.5% matches World Bank data.", "sources_used": ["World Bank"]}'

        result = asyncio.run(tier3_llm_check(
            claim="GDP grew 7.5% in 2024",
            metric="GDP growth rate",
            claimed_value=7.5,
            year=2024,
            official_value=7.48,
            percentage_error=0.27,
            official_source="World Bank",
        ))

        assert isinstance(result, Tier3Result)
        assert result.verdict == "accurate"
        assert result.confidence == 0.95
        assert "World Bank" in result.sources_used

    @patch("verifier.tier3_llm._call_gemini", new_callable=AsyncMock)
    def test_no_api_key(self, mock_gemini):
        """
        When GEMINI_API_KEY is not set, _call_gemini returns None.
        tier3_llm_check should return 'unverifiable' gracefully.

        WHY THIS MATTERS:
        In development, your teammates might not have a Gemini key.
        The system should degrade gracefully, not crash.
        """
        mock_gemini.return_value = None  # simulates no API key

        result = asyncio.run(tier3_llm_check(claim="GDP grew 7.5%"))

        assert result.verdict == "unverifiable"
        assert result.confidence == 0.0
        assert "unavailable" in result.explanation.lower() or "unparseable" in result.explanation.lower()

    @patch("verifier.tier3_llm._call_gemini", new_callable=AsyncMock)
    def test_invalid_verdict_normalized(self, mock_gemini):
        """
        If Gemini returns a verdict not in our vocabulary (e.g., "partially true"),
        it should be normalized to 'unverifiable'.

        WHY:
        The frontend expects exactly 4 verdict strings for color coding.
        Any other string would break the UI.
        """
        mock_gemini.return_value = '{"verdict": "partially true", "confidence": 0.7, "explanation": "Some parts are right.", "sources_used": []}'

        result = asyncio.run(tier3_llm_check(claim="GDP grew 7.5%"))

        assert result.verdict == "unverifiable"  # normalized from "partially true"

    @patch("verifier.tier3_llm._call_gemini", new_callable=AsyncMock)
    def test_confidence_clamped_to_range(self, mock_gemini):
        """
        If Gemini returns confidence > 1.0 or < 0.0, it should be clamped.

        WHY:
        LLMs sometimes return 95 instead of 0.95, or -0.1 for uncertainty.
        Clamping to [0.0, 1.0] prevents UI bugs (progress bars overflowing, etc).
        """
        mock_gemini.return_value = '{"verdict": "accurate", "confidence": 95.0, "explanation": "Sure.", "sources_used": []}'

        result = asyncio.run(tier3_llm_check(claim="GDP grew 7.5%"))

        assert result.confidence == 1.0  # clamped from 95.0

    @patch("verifier.tier3_llm._call_gemini", new_callable=AsyncMock)
    def test_unparseable_response(self, mock_gemini):
        """Gemini returns non-JSON text → graceful degradation."""
        mock_gemini.return_value = "I'm sorry, I cannot verify economic claims."

        result = asyncio.run(tier3_llm_check(claim="GDP grew 7.5%"))

        assert result.verdict == "unverifiable"
        assert result.raw_response == "I'm sorry, I cannot verify economic claims."
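For orientation, the parsing behaviour these tests describe reduces to roughly the following shape (a hypothetical minimal reimplementation; the real _parse_llm_response may differ in details):

import json
import re

def parse_sketch(raw):
    if not raw:
        return None
    cleaned = re.sub(r"```(?:json)?", "", raw)        # strip markdown fences
    match = re.search(r"\{.*\}", cleaned, re.DOTALL)  # first { to last }
    if not match:
        return None
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError:
        return None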
tests/test_verdict_router.py
ADDED
|
@@ -0,0 +1,327 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
test_verdict_router.py — Tests for the RAV engine orchestrator
|
| 3 |
+
==============================================================
|
| 4 |
+
Run with: pytest tests/test_verdict_router.py -v
|
| 5 |
+
|
| 6 |
+
WHAT WE'RE TESTING:
|
| 7 |
+
- Verdict rule functions: _verdict_from_error, _nli_to_verdict
|
| 8 |
+
- Routing logic: when does Tier 1 short-circuit? When does it escalate?
|
| 9 |
+
- force_tier3: does it bypass all early returns?
|
| 10 |
+
|
| 11 |
+
MOCKING STRATEGY:
|
| 12 |
+
We mock ALL three tier functions + the extractor + evidence fetcher.
|
| 13 |
+
This isolates the ROUTING LOGIC from the actual verification logic.
|
| 14 |
+
|
| 15 |
+
Think of it like testing a traffic signal controller:
|
| 16 |
+
- We don't care if the roads actually have cars
|
| 17 |
+
- We care that the lights change at the right times
|
| 18 |
+
- So we simulate "cars detected" and check which light turns green
|
| 19 |
+
|
| 20 |
+
MOCK HIERARCHY (what calls what):
|
| 21 |
+
route_verification
|
| 22 |
+
├── extract_all ← mocked (no regex needed)
|
| 23 |
+
├── tier1_numeric_check ← mocked (no World Bank API call)
|
| 24 |
+
├── fetch_evidence ← mocked (no NewsAPI/Google call)
|
| 25 |
+
├── run_nli ← mocked (no BART model)
|
| 26 |
+
└── tier3_llm_check ← mocked (no Gemini call)
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
import sys
|
| 30 |
+
import os
|
| 31 |
+
import asyncio
|
| 32 |
+
|
| 33 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
| 34 |
+
|
| 35 |
+
from unittest.mock import patch, AsyncMock, MagicMock
|
| 36 |
+
from verifier.verdict_router import (
|
| 37 |
+
_verdict_from_error,
|
| 38 |
+
_nli_to_verdict,
|
| 39 |
+
route_verification,
|
| 40 |
+
VerificationResult,
|
| 41 |
+
TIER1_ERROR_CLEAR_LOW,
|
| 42 |
+
TIER1_ERROR_CLEAR_HIGH,
|
| 43 |
+
TIER2_CONFIDENCE_MIN,
|
| 44 |
+
)
|
| 45 |
+
from verifier.tier1_numeric import WorldBankNumericCheck
|
| 46 |
+
from verifier.tier2_nli import Tier2Result, NliResult
|
| 47 |
+
from verifier.tier3_llm import Tier3Result
|
| 48 |
+
from verifier.evidence_fetcher import EvidenceSnippet
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# =============================================================================
|
| 52 |
+
# VERDICT RULE TESTS (pure functions, no mocking needed)
|
| 53 |
+
# =============================================================================
|
| 54 |
+
|
| 55 |
+
class TestVerdictFromError:
|
| 56 |
+
"""
|
| 57 |
+
_verdict_from_error maps percentage_error → verdict string.
|
| 58 |
+
|
| 59 |
+
These are the SAME thresholds used in /verify/quick.
|
| 60 |
+
By centralizing them in verdict_router.py, we ensure consistency
|
| 61 |
+
across all endpoints.
|
| 62 |
+
"""
|
| 63 |
+
|
| 64 |
+
def test_none_returns_unverifiable(self):
|
| 65 |
+
"""No data at all → can't make a judgment."""
|
| 66 |
+
assert _verdict_from_error(None) == "unverifiable"
|
| 67 |
+
|
| 68 |
+
def test_zero_error_is_accurate(self):
|
| 69 |
+
"""0% error = exact match = accurate."""
|
| 70 |
+
assert _verdict_from_error(0.0) == "accurate"
|
| 71 |
+
|
| 72 |
+
def test_below_5_is_accurate(self):
|
| 73 |
+
"""4.99% error → within tolerance → accurate."""
|
| 74 |
+
assert _verdict_from_error(4.99) == "accurate"
|
| 75 |
+
|
| 76 |
+
def test_exactly_5_is_misleading(self):
|
| 77 |
+
"""5.0% is AT the boundary → misleading (not accurate)."""
|
| 78 |
+
assert _verdict_from_error(5.0) == "misleading"
|
| 79 |
+
|
| 80 |
+
def test_between_5_and_20_is_misleading(self):
|
| 81 |
+
"""15% error → misleading range."""
|
| 82 |
+
assert _verdict_from_error(15.0) == "misleading"
|
| 83 |
+
|
| 84 |
+
def test_exactly_20_is_false(self):
|
| 85 |
+
"""20.0% is AT the boundary → false."""
|
| 86 |
+
assert _verdict_from_error(20.0) == "false"
|
| 87 |
+
|
| 88 |
+
def test_above_20_is_false(self):
|
| 89 |
+
"""50% error → clearly false."""
|
| 90 |
+
assert _verdict_from_error(50.0) == "false"
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class TestNliToVerdict:
|
| 94 |
+
"""
|
| 95 |
+
_nli_to_verdict maps NLI aggregated labels to our verdict vocabulary.
|
| 96 |
+
|
| 97 |
+
WHY THE MAPPING EXISTS:
|
| 98 |
+
NLI models speak in terms of entailment/contradiction/neutral.
|
| 99 |
+
Our API speaks in terms of accurate/false/unverifiable.
|
| 100 |
+
This function is the translation layer.
|
| 101 |
+
"""
|
| 102 |
+
|
| 103 |
+
def test_entailment_maps_to_accurate(self):
|
| 104 |
+
assert _nli_to_verdict("entailment") == "accurate"
|
| 105 |
+
|
| 106 |
+
def test_contradiction_maps_to_false(self):
|
| 107 |
+
assert _nli_to_verdict("contradiction") == "false"
|
| 108 |
+
|
| 109 |
+
def test_neutral_maps_to_unverifiable(self):
|
| 110 |
+
assert _nli_to_verdict("neutral") == "unverifiable"
|
| 111 |
+
|
| 112 |
+
def test_insufficient_evidence_maps_to_unverifiable(self):
|
| 113 |
+
assert _nli_to_verdict("insufficient_evidence") == "unverifiable"
|
| 114 |
+
|
| 115 |
+
def test_unknown_maps_to_unverifiable(self):
|
| 116 |
+
"""Safety net: any unrecognized label → unverifiable."""
|
| 117 |
+
assert _nli_to_verdict("something_weird") == "unverifiable"
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# =============================================================================
|
| 121 |
+
# ROUTING LOGIC TESTS (full mocking)
|
| 122 |
+
# =============================================================================
|
| 123 |
+
|
| 124 |
+
# Helper: create a standard extraction result
|
| 125 |
+
def _fake_extraction(metric="GDP growth rate", value=7.5, year=2024, confidence=0.9):
|
| 126 |
+
return {
|
| 127 |
+
"original_text": f"Claims {metric} was {value} in {year}",
|
| 128 |
+
"metric": metric, "value": value, "year": year,
|
| 129 |
+
"confidence": confidence,
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
# Helper: create a Tier 1 result
|
| 133 |
+
def _fake_t1(official_value=6.49, percentage_error=15.56):
|
| 134 |
+
return WorldBankNumericCheck(
|
| 135 |
+
official_value=official_value, claimed_value=7.5,
|
| 136 |
+
percentage_error=percentage_error, source="World Bank",
|
| 137 |
+
indicator_code="NY.GDP.MKTP.KD.ZG",
|
| 138 |
+
source_url="https://data.worldbank.org/indicator/NY.GDP.MKTP.KD.ZG?locations=IN",
|
| 139 |
+
year=2024,
|
| 140 |
+
)
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
class TestRoutingLogic:
|
| 144 |
+
"""
|
| 145 |
+
Tests for the main route_verification function.
|
| 146 |
+
|
| 147 |
+
IMPORTANT: We patch at the IMPORT PATH, not the definition path.
|
| 148 |
+
verdict_router.py does: from extractor import extract_all
|
| 149 |
+
So we patch "verifier.verdict_router.extract_all" not "extractor.extract_all".
|
| 150 |
+
This is a common pytest gotcha!
|
| 151 |
+
"""
|
| 152 |
+
|
| 153 |
+
@patch("verifier.verdict_router.tier1_numeric_check", new_callable=AsyncMock)
|
| 154 |
+
@patch("verifier.verdict_router.extract_all")
|
| 155 |
+
def test_tier1_fast_path_accurate(self, mock_extract, mock_t1):
|
| 156 |
+
"""
|
| 157 |
+
SCENARIO: High extraction confidence + low error → Tier 1 alone is enough.
|
| 158 |
+
|
| 159 |
+
Conditions for fast path (all must be true):
|
| 160 |
+
1. official_value is not None (World Bank returned data)
|
| 161 |
+
2. percentage_error < 5% OR >= 20% (clear-cut case)
|
| 162 |
+
3. extraction confidence >= 0.8
|
| 163 |
+
4. force_tier3 is False
|
| 164 |
+
|
| 165 |
+
Result: Returns immediately with tier_used="tier1", skips Tier 2/3.
|
| 166 |
+
"""
|
| 167 |
+
mock_extract.return_value = _fake_extraction(confidence=0.9)
|
| 168 |
+
mock_t1.return_value = _fake_t1(official_value=7.48, percentage_error=0.27)
|
| 169 |
+
|
| 170 |
+
result = asyncio.run(route_verification("GDP grew 7.5% in 2024"))
|
| 171 |
+
|
| 172 |
+
assert result.tier_used == "tier1"
|
| 173 |
+
assert result.verdict == "accurate" # 0.27% error < 5%
|
| 174 |
+
assert result.tiers_run == ["tier1"]
|
| 175 |
+
assert result.evidence == [] # No evidence fetched
|
| 176 |
+
|
| 177 |
+
@patch("verifier.verdict_router.tier1_numeric_check", new_callable=AsyncMock)
|
| 178 |
+
@patch("verifier.verdict_router.extract_all")
|
| 179 |
+
def test_tier1_fast_path_false(self, mock_extract, mock_t1):
|
| 180 |
+
"""
|
| 181 |
+
SCENARIO: Clear error >= 20% → Tier 1 says 'false', no escalation.
|
| 182 |
+
"""
|
| 183 |
+
mock_extract.return_value = _fake_extraction(confidence=0.9)
|
| 184 |
+
mock_t1.return_value = _fake_t1(official_value=5.0, percentage_error=50.0)
|
| 185 |
+
|
| 186 |
+
result = asyncio.run(route_verification("GDP grew 7.5% in 2024"))
|
| 187 |
+
|
| 188 |
+
assert result.tier_used == "tier1"
|
| 189 |
+
assert result.verdict == "false" # 50% error >= 20%
|
| 190 |
+
|
| 191 |
+
@patch("verifier.verdict_router.run_nli", new_callable=AsyncMock)
|
| 192 |
+
@patch("verifier.verdict_router.fetch_evidence", new_callable=AsyncMock)
|
| 193 |
+
@patch("verifier.verdict_router.tier1_numeric_check", new_callable=AsyncMock)
|
| 194 |
+
@patch("verifier.verdict_router.extract_all")
|
| 195 |
+
def test_escalates_to_tier2_ambiguous_error(self, mock_extract, mock_t1, mock_evidence, mock_nli):
|
| 196 |
+
"""
|
| 197 |
+
SCENARIO: Error is 15% (ambiguous zone 5-20%) → escalates to Tier 2.
|
| 198 |
+
Tier 2 is confident (0.72 >= 0.6) → returns merged result.
|
| 199 |
+
|
| 200 |
+
WHY ESCALATION:
|
| 201 |
+
15% error is in the "misleading" zone, but we're not 100% sure.
|
| 202 |
+
Maybe the World Bank data is outdated, or the metric was misidentified.
|
| 203 |
+
Tier 2 checks news evidence to build more confidence.
|
| 204 |
+
"""
|
| 205 |
+
mock_extract.return_value = _fake_extraction(confidence=0.9)
|
| 206 |
+
mock_t1.return_value = _fake_t1(official_value=6.49, percentage_error=15.56)
|
| 207 |
+
mock_evidence.return_value = [
|
| 208 |
+
EvidenceSnippet(source="Reuters", title="GDP Report",
|
| 209 |
+
snippet="India GDP grew at 6.5 percent in fiscal 2024",
|
| 210 |
+
url="https://reuters.com", published_date="2024-06-01",
|
| 211 |
+
evidence_type="news"),
|
| 212 |
+
]
|
| 213 |
+
mock_nli.return_value = Tier2Result(
|
| 214 |
+
verdict="contradiction", confidence=0.72,
|
| 215 |
+
nli_results=[NliResult(label="contradiction", score=0.72,
|
| 216 |
+
snippet_source="Reuters",
|
| 217 |
+
snippet_text="India GDP grew at 6.5 percent")],
|
| 218 |
+
evidence_count=1,
|
| 219 |
+
)
|
| 220 |
+
|
| 221 |
+
result = asyncio.run(route_verification("GDP grew 7.5% in 2024"))
|
| 222 |
+
|
| 223 |
+
assert result.tier_used == "tier2"
|
| 224 |
+
assert result.tiers_run == ["tier1", "tier2"]
|
| 225 |
+
assert len(result.evidence) == 1
|
| 226 |
+
# Numeric verdict (misleading) overrides NLI because we have official data
|
| 227 |
+
assert result.verdict == "misleading"
|
| 228 |
+
|
| 229 |
+
@patch("verifier.verdict_router.tier3_llm_check", new_callable=AsyncMock)
|
| 230 |
+
@patch("verifier.verdict_router.run_nli", new_callable=AsyncMock)
|
| 231 |
+
@patch("verifier.verdict_router.fetch_evidence", new_callable=AsyncMock)
|
| 232 |
+
@patch("verifier.verdict_router.tier1_numeric_check", new_callable=AsyncMock)
|
| 233 |
+
@patch("verifier.verdict_router.extract_all")
|
| 234 |
+
def test_escalates_to_tier3_low_nli_confidence(
|
| 235 |
+
self, mock_extract, mock_t1, mock_evidence, mock_nli, mock_t3
|
| 236 |
+
):
|
| 237 |
+
"""
|
| 238 |
+
SCENARIO: Tier 2 confidence < 0.6 → escalates to Tier 3.
|
| 239 |
+
|
| 240 |
+
This happens when evidence snippets are mixed or irrelevant,
|
| 241 |
+
so the NLI model can't reach a confident conclusion.
|
| 242 |
+
"""
|
| 243 |
+
mock_extract.return_value = _fake_extraction(confidence=0.9)
|
| 244 |
+
mock_t1.return_value = _fake_t1(official_value=6.49, percentage_error=15.56)
|
| 245 |
+
mock_evidence.return_value = []
|
| 246 |
+
mock_nli.return_value = Tier2Result(
|
| 247 |
+
verdict="neutral", confidence=0.35, # <0.6 threshold
|
| 248 |
+
nli_results=[], evidence_count=0,
|
| 249 |
+
)
|
| 250 |
+
mock_t3.return_value = Tier3Result(
|
| 251 |
+
verdict="misleading", confidence=0.82,
|
| 252 |
+
explanation="The claimed 7.5% exceeds the World Bank figure of 6.49%.",
|
| 253 |
+
sources_used=["World Bank"],
|
| 254 |
+
raw_response="...",
|
| 255 |
+
)
|
| 256 |
+
|
| 257 |
+
result = asyncio.run(route_verification("GDP grew 7.5% in 2024"))
|
| 258 |
+
|
| 259 |
+
assert result.tier_used == "tier3"
|
| 260 |
+
assert result.tiers_run == ["tier1", "tier2", "tier3"]
|
| 261 |
+
assert result.verdict == "misleading"
|
| 262 |
+
assert result.confidence == 0.82
|
| 263 |
+
|
| 264 |
+
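(Taken together, the two escalation tests above pin down the routing ladder: a 5-20% error sends the claim past Tier 1, and an NLI confidence below 0.6 sends it past Tier 2. A condensed sketch of those rules follows; the constant names are assumptions, as the real definitions live in verdict_router.py.)

# Sketch of the escalation rules the tests above exercise (names assumed)
AMBIGUOUS_ERROR_LOW = 5.0     # % error below this: Tier 1 verdict is decisive
AMBIGUOUS_ERROR_HIGH = 20.0   # % error above this: clearly false, no escalation
TIER2_CONFIDENCE_MIN = 0.6    # NLI confidence below this: escalate to Tier 3

def needs_tier2(percentage_error: float) -> bool:
    # 15.56% falls inside the ambiguous zone, so both tests escalate past Tier 1
    return AMBIGUOUS_ERROR_LOW <= percentage_error <= AMBIGUOUS_ERROR_HIGH

def needs_tier3(tier2_confidence: float) -> bool:
    # 0.72 stays at Tier 2; 0.35 falls through to the LLM tier
    return tier2_confidence < TIER2_CONFIDENCE_MIN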
@patch("verifier.verdict_router.tier3_llm_check", new_callable=AsyncMock)
|
| 265 |
+
@patch("verifier.verdict_router.run_nli", new_callable=AsyncMock)
|
| 266 |
+
@patch("verifier.verdict_router.fetch_evidence", new_callable=AsyncMock)
|
| 267 |
+
@patch("verifier.verdict_router.tier1_numeric_check", new_callable=AsyncMock)
|
| 268 |
+
@patch("verifier.verdict_router.extract_all")
|
| 269 |
+
def test_force_tier3_bypasses_early_returns(
|
| 270 |
+
self, mock_extract, mock_t1, mock_evidence, mock_nli, mock_t3
|
| 271 |
+
):
|
| 272 |
+
"""
|
| 273 |
+
SCENARIO: force_tier3=True (from /verify/deep endpoint).
|
| 274 |
+
|
| 275 |
+
Even though Tier 1 has a decisive result (0.27% error, clearly accurate),
|
| 276 |
+
we force execution through ALL tiers because the user explicitly
|
| 277 |
+
requested deep analysis.
|
| 278 |
+
"""
|
| 279 |
+
mock_extract.return_value = _fake_extraction(confidence=0.9)
|
| 280 |
+
# Tier 1 is decisive (would normally short-circuit)
|
| 281 |
+
mock_t1.return_value = _fake_t1(official_value=7.48, percentage_error=0.27)
|
| 282 |
+
mock_evidence.return_value = []
|
| 283 |
+
mock_nli.return_value = Tier2Result(
|
| 284 |
+
verdict="entailment", confidence=0.85,
|
| 285 |
+
nli_results=[], evidence_count=0,
|
| 286 |
+
)
|
| 287 |
+
mock_t3.return_value = Tier3Result(
|
| 288 |
+
verdict="accurate", confidence=0.96,
|
| 289 |
+
explanation="All sources confirm the claim.",
|
| 290 |
+
sources_used=["World Bank"], raw_response="...",
|
| 291 |
+
)
|
| 292 |
+
|
| 293 |
+
result = asyncio.run(route_verification(
|
| 294 |
+
"GDP grew 7.5% in 2024", force_tier3=True
|
| 295 |
+
))
|
| 296 |
+
|
| 297 |
+
assert result.tier_used == "tier3" # NOT tier1, even though it was decisive
|
| 298 |
+
assert result.tiers_run == ["tier1", "tier2", "tier3"] # ALL tiers ran
|
| 299 |
+
|
| 300 |
+
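(The force_tier3 flag is what the /verify/deep endpoint sets. The real handler lives in main.py and is not shown in this hunk; a hedged sketch of how such an endpoint might call the router, with `ClaimRequest` as an assumed request model, is:)

from fastapi import FastAPI
from pydantic import BaseModel

from verifier import route_verification

app = FastAPI()

class ClaimRequest(BaseModel):  # assumed request schema, not the real one
    text: str

@app.post("/verify/deep")
async def verify_deep(req: ClaimRequest):
    # force_tier3=True disables every early return, so all three tiers run
    return await route_verification(req.text, force_tier3=True)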
@patch("verifier.verdict_router.tier1_numeric_check", new_callable=AsyncMock)
|
| 301 |
+
@patch("verifier.verdict_router.extract_all")
|
| 302 |
+
def test_low_extraction_confidence_skips_fast_path(self, mock_extract, mock_t1):
|
| 303 |
+
"""
|
| 304 |
+
SCENARIO: extraction confidence = 0.6 (below 0.8 threshold).
|
| 305 |
+
|
| 306 |
+
Even though Tier 1 error is clear (<5%), low extraction confidence
|
| 307 |
+
means we might have the WRONG metric. So we don't trust Tier 1
|
| 308 |
+
alone and escalate to Tier 2 for evidence-based backup.
|
| 309 |
+
|
| 310 |
+
This is controlled by TIER1_STRONG_THRESHOLD = 0.8 in verdict_router.py.
|
| 311 |
+
"""
|
| 312 |
+
mock_extract.return_value = _fake_extraction(confidence=0.6) # Below 0.8
|
| 313 |
+
mock_t1.return_value = _fake_t1(official_value=7.48, percentage_error=0.27)
|
| 314 |
+
|
| 315 |
+
# Since this will try to go to Tier 2, we need those mocks too
|
| 316 |
+
with patch("verifier.verdict_router.fetch_evidence", new_callable=AsyncMock) as mock_ev, \
|
| 317 |
+
patch("verifier.verdict_router.run_nli", new_callable=AsyncMock) as mock_nli:
|
| 318 |
+
mock_ev.return_value = []
|
| 319 |
+
mock_nli.return_value = Tier2Result(
|
| 320 |
+
verdict="entailment", confidence=0.75,
|
| 321 |
+
nli_results=[], evidence_count=0,
|
| 322 |
+
)
|
| 323 |
+
|
| 324 |
+
result = asyncio.run(route_verification("GDP grew 7.5% in 2024"))
|
| 325 |
+
|
| 326 |
+
assert result.tier_used != "tier1" # Did NOT take the fast path
|
| 327 |
+
assert "tier2" in result.tiers_run
|
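(The docstring names TIER1_STRONG_THRESHOLD = 0.8 as the gate; combined with the clear-error bound from the earlier tests, the fast-path condition this test pins down is roughly the following sketch, not the exact source:)

TIER1_STRONG_THRESHOLD = 0.8  # named in verdict_router.py per the docstring above

def tier1_fast_path(extraction_confidence: float, percentage_error: float) -> bool:
    # Take the fast path only when we trust the extracted metric AND the
    # numeric error is unambiguous; confidence 0.6 fails the first check.
    return (extraction_confidence >= TIER1_STRONG_THRESHOLD
            and percentage_error < 5.0)

The suite runs under plain pytest (for example, `pytest tests/test_verdict_router.py -v`); no network access or model downloads are needed, since every external call is mocked.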
verifier/__init__.py
CHANGED
@@ -13,9 +13,54 @@ from .tier1_numeric import (
     tier1_numeric_check,
 )
 
+from .tier2_nli import (
+    NliResult,
+    Tier2Result,
+    run_nli,
+    # _run_nli_sync,  # Not exported: it's an internal helper for the async
+    # wrapper; the leading _ means it's private, not intended for external use.
+)
+
+from .tier3_llm import (
+    EvidenceSummary,
+    Tier3Result,
+    tier3_llm_check,
+)
+
+from .evidence_fetcher import (
+    EvidenceSnippet,
+    fetch_evidence,
+    fetch_google_fact_checks,
+    fetch_news_snippets,
+)
+
+from .verdict_router import (
+    route_verification,
+    VerificationResult,
+    EvidenceItem,
+)
+
 __all__ = [
     "METRIC_TO_WORLD_BANK_INDICATOR",
     "WorldBankNumericCheck",
     "fetch_world_bank_series",
     "tier1_numeric_check",
+    # Tier 2
+    "EvidenceSnippet",
+    "fetch_evidence",
+    "fetch_google_fact_checks",
+    "fetch_news_snippets",
+    "NliResult",
+    "Tier2Result",
+    "run_nli",
+    # Tier 3
+    "EvidenceSummary",
+    "Tier3Result",
+    "tier3_llm_check",
+    # Router
+    "VerificationResult",
+    "EvidenceItem",
+    "route_verification",
 ]
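(With these re-exports in place, callers no longer need to know the submodule layout. A small usage example against the package root, reusing the claim text from the tests:)

import asyncio

# Everything below now resolves from the package root
from verifier import route_verification, VerificationResult, EvidenceSnippet

result: VerificationResult = asyncio.run(
    route_verification("GDP grew 7.5% in 2024")
)
print(result.verdict, result.tier_used, result.tiers_run)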
verifier/tier2_nli.py
CHANGED
@@ -17,11 +17,14 @@ for higher accuracy when needed.
 from __future__ import annotations
 
 import asyncio
+import logging
 from dataclasses import dataclass
 from functools import lru_cache
 
 from verifier.evidence_fetcher import EvidenceSnippet
 
+logger = logging.getLogger("bware.nlp.tier2")
+
 
 MODEL_NAME = "facebook/bart-large-mnli"
 # To upgrade quality later, change to:
@@ -55,13 +58,13 @@ def _load_pipeline():
     Downloads the model from HuggingFace on first run (~1.6GB).
     """
     from transformers import pipeline
-
+    logger.info("Loading NLI model: %s (first call only...)", MODEL_NAME)
     nli_pipeline = pipeline(
         "zero-shot-classification",
         model=MODEL_NAME,
         device=-1,  # -1 = CPU; change to 0 for GPU
     )
-
+    logger.info("NLI model loaded and ready.")
     return nli_pipeline
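(Since the new logger is namespaced as bware.nlp.tier2, the load-time messages only appear if the host application enables INFO-level logging before the first run_nli call. A minimal setup sketch:)

import logging

# Show the "Loading NLI model..." / "model loaded" messages from tier2_nli,
# either globally or scoped to just this module's logger:
logging.basicConfig(level=logging.INFO)
logging.getLogger("bware.nlp.tier2").setLevel(logging.INFO)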