Vi0509 commited on
Commit
76bf8b6
·
verified ·
1 Parent(s): a79cdd9

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +293 -0
app.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Kaeva Fact-Check API — HuggingFace Space
4
+ Two-stage pipeline:
5
+ Stage 1: DeBERTa-v3-base binary classifier (local, fast, free)
6
+ Stage 2: Gemini 2.0 Flash + Google Search grounding (cited evidence)
7
+ """
8
+
9
+ import os
10
+ import json
11
+ import time
12
+ import logging
13
+ import urllib.request
14
+ from typing import Optional
15
+
16
+ import torch
17
+ import gradio as gr
18
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
19
+
20
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
log = logging.getLogger("factcheck")

# ============================================================
# CONFIG
# ============================================================
# Fine-tuned DeBERTa binary classifier hosted on the HF Hub.
MODEL_ID = "Vi0509/kaeva-factcheck-deberta"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Stage-1 confidence below this threshold escalates the claim to Gemini.
CONFIDENCE_THRESHOLD = 0.65

# GCP Auth — service account JSON stored as HF Space secret
GCP_SA_JSON = os.environ.get("GCP_SERVICE_ACCOUNT_JSON", "")
GCP_PROJECT = "eastern-flight-477705-n0"
# In-process OAuth token cache: {"token": str | None, "expiry": epoch seconds}.
_cached_token = {"token": None, "expiry": 0}
34
+
35
+
36
def get_gcp_token():
    """Return a cached OAuth2 access token for the GCP service account.

    Builds credentials from the GCP_SERVICE_ACCOUNT_JSON secret, refreshes
    them via google-auth, and caches the token until ~60 s before expiry.

    Returns:
        The bearer token string, or None when the secret is missing or
        authentication fails (failures are logged, never raised).
    """
    # Reuse the cached token while it is still valid (60 s safety margin).
    if _cached_token["token"] and time.time() < _cached_token["expiry"] - 60:
        return _cached_token["token"]

    if not GCP_SA_JSON:
        return None

    try:
        from google.oauth2 import service_account
        from google.auth.transport.requests import Request

        # Build credentials directly from the in-memory JSON. The original
        # wrote the secret to a NamedTemporaryFile and unlinked it only on
        # the success path, leaking the key material to disk whenever
        # refresh() raised.
        creds = service_account.Credentials.from_service_account_info(
            json.loads(GCP_SA_JSON),
            scopes=[
                "https://www.googleapis.com/auth/cloud-platform",
                "https://www.googleapis.com/auth/generative-language",
            ],
        )
        creds.refresh(Request())

        _cached_token["token"] = creds.token
        # Fall back to ~58 minutes when the library reports no expiry.
        _cached_token["expiry"] = (
            creds.expiry.timestamp() if creds.expiry else time.time() + 3500
        )
        return creds.token
    except Exception as e:
        log.error(f"GCP auth error: {e}")
        return None
67
+
68
# ============================================================
# STAGE 1: DeBERTa Classifier
# ============================================================
# Load the fine-tuned classifier once at import time so every request
# reuses the same in-memory weights.
log.info(f"Loading DeBERTa model on {DEVICE}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID).to(DEVICE)
model.eval()  # inference mode: disables dropout
log.info("DeBERTa loaded.")
76
+
77
+
78
def classify_claim(text: str) -> dict:
    """Stage 1: score a claim with the local DeBERTa binary classifier.

    Returns a dict with the winning label ("REAL"/"FAKE"), its confidence,
    and the raw softmax scores for both classes.
    """
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=256,
        padding="max_length",
        return_tensors="pt",
    ).to(DEVICE)

    with torch.no_grad():
        probabilities = torch.softmax(model(**encoded).logits, dim=-1)[0]

    # Index 0 = REAL, index 1 = FAKE (matches the fine-tuned label order).
    real_score = probabilities[0].item()
    fake_score = probabilities[1].item()

    return {
        "label": "REAL" if real_score > fake_score else "FAKE",
        "confidence": max(real_score, fake_score),
        "real_score": real_score,
        "fake_score": fake_score,
    }
94
+
95
+
96
# ============================================================
# STAGE 2: Gemini + Google Search Grounding
# ============================================================
# Prompt template for gemini_verify(). Doubled braces escape the literal
# JSON skeleton for str.format; only {claim} is substituted.
GEMINI_PROMPT = """You are a fact-checker. Analyze the following claim using the search results provided.

CLAIM: "{claim}"

Instructions:
1. Determine if the claim is TRUE, FALSE, PARTIALLY TRUE, or UNVERIFIABLE
2. Cite specific sources that support or refute the claim
3. Provide a brief explanation (2-3 sentences)
4. Rate your confidence (0.0 to 1.0)

Respond in this exact JSON format:
{{
"verdict": "TRUE|FALSE|PARTIALLY TRUE|UNVERIFIABLE",
"confidence": 0.0-1.0,
"explanation": "Brief explanation with evidence",
"key_finding": "One-sentence summary"
}}"""
116
+
117
+
118
def gemini_verify(claim: str) -> dict:
    """Stage 2: verify a claim with Gemini 2.0 Flash + Google Search grounding.

    Calls the Generative Language API with the googleSearch tool enabled,
    parses the model's JSON verdict, and attaches grounding sources and the
    web search queries used.

    Returns:
        The parsed verdict dict, augmented with "sources" (max 10) and
        "search_queries"; on auth/API failure, a dict with "error" and
        verdict "UNVERIFIABLE".
    """
    token = get_gcp_token()
    if not token:
        return {"error": "GCP credentials not configured", "verdict": "UNVERIFIABLE"}

    url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"

    payload = {
        "contents": [{"parts": [{"text": GEMINI_PROMPT.format(claim=claim)}]}],
        "tools": [{"googleSearch": {}}],
        "generationConfig": {"temperature": 0.1, "maxOutputTokens": 1024}
    }

    req = urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Authorization": f"Bearer {token}", "Content-Type": "application/json",
                 "x-goog-user-project": GCP_PROJECT},
        method="POST")

    try:
        # Close the HTTP response promptly — the original left the socket
        # open until garbage collection.
        with urllib.request.urlopen(req, timeout=30) as resp:
            data = json.loads(resp.read())

        candidate = data["candidates"][0]
        text = candidate["content"]["parts"][0]["text"]

        # The model is asked for strict JSON but often wraps it in a
        # ```json fenced block — strip the fence before parsing.
        try:
            clean = text.strip()
            if clean.startswith("```"):
                clean = clean.split("\n", 1)[1].rsplit("```", 1)[0]
            result = json.loads(clean)
        except json.JSONDecodeError:
            # Fall back to the raw text so the caller still sees the answer.
            result = {"verdict": "UNVERIFIABLE", "explanation": text, "confidence": 0.5}

        # Extract the cited web sources from the grounding metadata.
        grounding = candidate.get("groundingMetadata", {})
        sources = []
        for chunk in grounding.get("groundingChunks", []):
            web = chunk.get("web", {})
            if web.get("uri"):
                sources.append({"title": web.get("title", ""), "url": web["uri"]})

        result["sources"] = sources[:10]
        # BUG FIX: webSearchQueries is a list of plain strings in the API
        # response; the original called .get() on each entry, raising
        # AttributeError and (via the broad except below) discarding the
        # entire grounded result. Accept both string and dict entries.
        result["search_queries"] = [
            q.get("searchQuery", "") if isinstance(q, dict) else str(q)
            for q in grounding.get("webSearchQueries", [])
        ]

        return result

    except urllib.error.HTTPError as e:
        error_body = e.read().decode()[:500]
        log.error(f"Gemini API error {e.code}: {error_body}")
        return {"error": f"Gemini API error {e.code}", "verdict": "UNVERIFIABLE"}
    except Exception as e:
        log.error(f"Gemini error: {e}")
        return {"error": str(e), "verdict": "UNVERIFIABLE"}
178
+
179
+
180
# ============================================================
# COMBINED PIPELINE
# ============================================================
def fact_check(claim: str, force_search: bool = False) -> dict:
    """Full two-stage fact-check pipeline.

    Args:
        claim: Statement to verify; must have at least 10 non-space chars.
        force_search: When True, always run Gemini regardless of the
            classifier's verdict.

    Returns:
        Dict with the claim, per-stage results, "final_verdict",
        "final_confidence" and "processing_time_ms"; or {"error": ...}
        for a too-short claim.
    """
    if not claim or len(claim.strip()) < 10:
        return {"error": "Claim too short. Provide a meaningful statement to verify."}

    start = time.time()

    # Stage 1: local DeBERTa classifier (fast path).
    stage1 = classify_claim(claim)
    result = {
        "claim": claim,
        "stage1_classifier": stage1,
        "pipeline": "classifier_only",
        "processing_time_ms": 0,
    }

    # Default verdict comes from stage 1; stage 2 may override it below.
    # (The original duplicated this assignment in two identical else arms.)
    result["final_verdict"] = stage1["label"]
    result["final_confidence"] = stage1["confidence"]

    # Stage 2: escalate when forced, flagged FAKE, or low confidence.
    needs_verification = (
        force_search
        or stage1["label"] == "FAKE"
        or stage1["confidence"] < CONFIDENCE_THRESHOLD
    )

    if needs_verification and GCP_SA_JSON:
        stage2 = gemini_verify(claim)
        result["stage2_gemini"] = stage2
        result["pipeline"] = "classifier + gemini_search"

        # Trust Gemini only when it reached a definite verdict.
        if stage2.get("verdict") and stage2["verdict"] != "UNVERIFIABLE":
            result["final_verdict"] = stage2["verdict"]
            result["final_confidence"] = stage2.get("confidence", stage1["confidence"])

    result["processing_time_ms"] = round((time.time() - start) * 1000)
    return result
224
+
225
+
226
# ============================================================
# GRADIO UI
# ============================================================
def gradio_check(claim: str, force_gemini: bool) -> str:
    """Run the pipeline and pretty-print the result as JSON for the UI."""
    outcome = fact_check(claim, force_search=force_gemini)
    return json.dumps(outcome, indent=2)
232
+
233
+
234
# Declarative Gradio UI: claim input + options on the left, the raw
# pipeline result dict rendered as JSON on the right.
with gr.Blocks(title="Kaeva Fact-Check", theme=gr.themes.Base()) as demo:
    gr.Markdown("""
# 🔍 Kaeva Fact-Check
**Two-stage AI fact-checking pipeline**
- **Stage 1:** DeBERTa classifier — instant binary detection (real vs fake)
- **Stage 2:** Gemini 2.0 Flash + Google Search — live evidence with cited sources
""")

    with gr.Row():
        with gr.Column(scale=3):
            claim_input = gr.Textbox(
                label="Enter a claim to verify",
                placeholder="e.g., The Great Wall of China is visible from space.",
                lines=3
            )
            # Override: skip the classifier gate and always call Gemini.
            force_search = gr.Checkbox(label="Force Google Search verification (bypass classifier)", value=False)
            check_btn = gr.Button("🔍 Fact-Check", variant="primary", size="lg")

        with gr.Column(scale=4):
            output = gr.JSON(label="Result")

    # Canned claims; the boolean pre-fills the force-search checkbox.
    gr.Examples(
        examples=[
            ["The Earth is flat.", False],
            ["Water boils at 100 degrees Celsius at sea level.", False],
            ["COVID-19 vaccines contain microchips.", True],
            ["The speed of light is approximately 300,000 km/s.", False],
            ["Drinking bleach cures diseases.", True],
        ],
        inputs=[claim_input, force_search],
    )

    # NOTE(review): the button is wired to fact_check (dict output suits
    # gr.JSON), so gradio_check above is unused by this UI.
    check_btn.click(fn=fact_check, inputs=[claim_input, force_search], outputs=output)
267
+
268
# ============================================================
# API ENDPOINT
# ============================================================
# One FastAPI app carries both the REST routes and the mounted Gradio UI.
# BUG FIX: the original passed gr.routes.App() (gradio's internal routes
# class, not a FastAPI instance) to mount_gradio_app, and registered the
# /api/* routes on a second FastAPI app that was never served — because
# demo.launch() started its own server, every REST endpoint was dead code.
from fastapi import FastAPI

app = FastAPI()


@app.post("/api/check")
async def api_check(request: dict):
    """Fact-check one claim. Body: {"claim": str, "force_search": bool}."""
    claim = request.get("claim", "")
    force = request.get("force_search", False)
    return fact_check(claim, force_search=force)


@app.post("/api/batch")
async def api_batch(request: dict):
    """Fact-check several claims. Body: {"claims": [str, ...]}."""
    claims = request.get("claims", [])
    results = [fact_check(c) for c in claims[:20]]  # Max 20 per batch
    return {"results": results}


@app.get("/api/health")
async def health():
    """Liveness probe exposing the loaded model id and device."""
    return {"status": "ok", "model": MODEL_ID, "device": str(DEVICE)}


# Mount the Gradio UI at the root of the same app so UI and API share one
# server on the HF-Spaces port.
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    import uvicorn  # shipped with the gradio/fastapi stack

    uvicorn.run(app, host="0.0.0.0", port=7860)