Files changed (1)
  1. app.py +437 -309
app.py CHANGED
@@ -1,384 +1,512 @@
  import os
- import json
  import re
  from typing import List, Dict, Any, Optional
- from fastapi import FastAPI, HTTPException
- from fastapi.responses import HTMLResponse
- from pydantic import BaseModel
- from dotenv import load_dotenv
- import requests
- from bs4 import BeautifulSoup
- from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
- from fastapi.middleware.cors import CORSMiddleware

- # ---------------- Lazy-loaded AI Models ----------------
  ZS_PIPE = None
- SENTE_MODEL = None
  GEMINI_CLIENT = None

  def get_zs_pipe():
      global ZS_PIPE
      if ZS_PIPE is None:
          try:
              from transformers import pipeline
-             # much smaller model (~250MB vs 1.3GB)
              ZS_PIPE = pipeline("zero-shot-classification", model="typeform/distilbert-base-uncased-mnli")
-         except Exception:
              ZS_PIPE = None
      return ZS_PIPE

  def get_sente_model():
-     global SENTE_MODEL
-     if SENTE_MODEL is None:
          try:
              from sentence_transformers import SentenceTransformer
-             # smaller semantic similarity model (~80MB vs 400MB)
-             SENTE_MODEL = SentenceTransformer("all-MiniLM-L6-v2")
-         except Exception:
-             SENTE_MODEL = None
-     return SENTE_MODEL
-

  def get_gemini_client():
      global GEMINI_CLIENT
-     if GEMINI_CLIENT is None:
          try:
              from google import genai
-             GEMINI_CLIENT = genai.Client()  # uses GEMINI_API_KEY from environment
-         except Exception:
              GEMINI_CLIENT = None
      return GEMINI_CLIENT

- # ---------------- Env Vars ----------------
- load_dotenv()
- GNEWS_API_KEY = os.getenv("GNEWS_KEY")
- NEWSORG_API_KEY = os.getenv("NEWSORG_KEY")
- GEMINI_API_KEY = os.getenv("AI_API_KEY")
-
- app = FastAPI(title="Hybrid Misinformation Detector")
- # Define allowed origins
- origins = ["*"]
-
- # Add CORS middleware
- app.add_middleware(
-     CORSMiddleware,
-     allow_origins=origins,        # List of allowed origins
-     allow_credentials=True,       # Allow cookies and credentials
-     allow_methods=["*"],          # Allow all HTTP methods (GET, POST, etc.)
-     allow_headers=["*"],          # Allow all headers
- )
- # ---------------- Models ----------------
- class VerifyRequest(BaseModel):
-     text: str
-     mode: Optional[str] = "fast"  # fast, deep, hybrid
-
- # ---------------- Utilities ----------------
- def safe_headers():
-     return {"User-Agent": "misinfo-tool/1.0 (+https://example.com)"}
-
- def domain_from_url(url: str) -> Optional[str]:
-     if not url: return None
      try:
-         m = re.search(r"https?://(?:www\.)?([^/]+)/?", url)
-         if m:
-             domain = m.group(1).lower()
-             parts = domain.split('.')
-             if len(parts) > 2:
-                 domain = ".".join(parts[-2:])
-             return domain
-     except Exception:
-         return None
-     return None
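# Worked example of the domain heuristic above, and why the rewrite below
# switches to tldextract: for a ccTLD URL, the "keep the last two labels" rule
# returns the public suffix instead of the site.
#
#     domain_from_url("https://www.bbc.co.uk/news")
#     # m.group(1) -> "bbc.co.uk"; parts -> ["bbc", "co", "uk"] (3 > 2)
#     # ".".join(parts[-2:]) -> "co.uk", so "bbc.co.uk" in TRUSTED_DOMAINS never matches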
-
- # ---------------- Trusted / Blacklist ----------------
- TRUSTED_DOMAINS = {
-     "bbc.co.uk","bbc.com","cnn.com","nytimes.com","reuters.com","apnews.com",
-     "theguardian.com","npr.org","washingtonpost.com","wsj.com","usatoday.com",
-     "bloomberg.com","aljazeera.com","msnbc.com","cnbc.com","foxnews.com",
-     "scientificamerican.com","nature.com","sciencedaily.com"
- }
-
- BLACKLISTED_DOMAINS = {
-     "imdb.com","youtube.com","wikipedia.org","fandom.com","comicbook.com",
-     "rottentomatoes.com","hulu.com","netflix.com","ign.com","forbes.com"
- }

- UNWANTED_KEYWORDS = [
-     "movie","film","episode","tv show","trailer","comic","manga","fan","fandom",
-     "review","fiction","novel","fantasy","screenplay","actor","actress"
- ]

- # ---------------- NLP classify ----------------
- def classify_text_type(text: str) -> Dict[str, Any]:
-     labels = ["news","rumor","fact","opinion","satire","unverifiable"]
-     pipe = get_zs_pipe()
-     if pipe:
-         try:
-             res = pipe(text, labels, multi_label=False, truncation=True)
-             label = res["labels"][0]
-             score = float(res["scores"][0])
-             return {"type": label, "score": round(score,3), "scores": dict(zip(res["labels"], res["scores"]))}
-         except Exception:
-             pass
-     t = text.lower()
-     if any(k in t for k in ["according to","reported","breaking","news","announced"]):
-         return {"type":"news","score":0.65,"scores":{}}
-     if any(k in t for k in ["i think","in my opinion","i believe","should"]):
-         return {"type":"opinion","score":0.7,"scores":{}}
-     if any(k in t for k in ["joke","satire","not real","parody"]):
-         return {"type":"satire","score":0.7,"scores":{}}
-     if any(k in t for k in ["study shows","research","published","peer-reviewed"]):
-         return {"type":"fact","score":0.6,"scores":{}}
-     return {"type":"rumor","score":0.45,"scores":{}}
-
- def summarize_text(text: str, max_len=300) -> str:
-     sentences = re.split(r'(?<=[.!?]) +', text.strip())
-     summary = sentences[0] if sentences else text
-     if len(summary) > max_len:
-         summary = summary[:max_len].rsplit(' ',1)[0] + "..."
-     return summary
-
- # ---------------- Search ----------------
- def fetch_gnews(query: str, max_results=6) -> List[Dict[str,str]]:
-     if not GNEWS_API_KEY:
          return []
      try:
-         url = "https://gnews.io/api/v4/search"
-         params = {"q": query, "token": GNEWS_API_KEY, "max": max_results, "lang":"en"}
-         r = requests.get(url, params=params, headers=safe_headers(), timeout=6)
          r.raise_for_status()
          js = r.json()
-         return [{"title": a.get("title"), "url": a.get("url"), "source": a.get("source",{}).get("name"), "snippet": a.get("description")} for a in js.get("articles", [])[:max_results]]
-     except Exception:
          return []

- def fetch_newsapi(query: str, max_results=6) -> List[Dict[str,str]]:
-     if not NEWSORG_API_KEY:
          return []
      try:
-         url = "https://newsapi.org/v2/everything"
-         params = {"q": query, "pageSize": max_results, "apiKey": NEWSORG_API_KEY, "language":"en"}
-         r = requests.get(url, params=params, headers=safe_headers(), timeout=6)
          r.raise_for_status()
          js = r.json()
-         return [{"title": a.get("title"), "url": a.get("url"), "source": a.get("source",{}).get("name"), "snippet": a.get("description")} for a in js.get("articles", [])[:max_results]]
-     except Exception:
          return []

- def duckduckgo_search(query: str, max_results=8) -> List[Dict[str,str]]:
      try:
-         url = "https://html.duckduckgo.com/html/"
-         r = requests.post(url, data={"q": query}, headers=safe_headers(), timeout=6)
          r.raise_for_status()
-         soup = BeautifulSoup(r.text, "html.parser")
          results = []
-         for res in soup.select(".result__a")[:max_results]:
-             title = res.get_text()
-             href = res.get("href")
-             snippet_node = res.find_parent().select_one(".result__snippet")
-             snippet = snippet_node.get_text() if snippet_node else ""
-             results.append({"title": title, "url": href, "source":None, "snippet": snippet})
          return results
-     except Exception:
          return []

- # ---------------- Optimized fetch all sources ----------------
- def fetch_all_sources(query: str) -> List[Dict[str,str]]:
-     with ThreadPoolExecutor(max_workers=3) as executor:
-         futures = [
-             executor.submit(fetch_gnews, query),
-             executor.submit(fetch_newsapi, query),
-             executor.submit(duckduckgo_search, query)
-         ]
-         results = []
-         for f in futures:
-             try:
-                 results.extend(f.result())
-             except:
-                 pass
-     return results
-
- # ---------------- Filtering ----------------
- def is_unwanted_snippet(snippet: str) -> bool:
-     if not snippet: return False
-     s = snippet.lower()
-     return any(k in s for k in UNWANTED_KEYWORDS)
-
- def filter_sources(sources: List[Dict[str,str]]) -> List[Dict[str,str]]:
-     kept, seen = [], set()
-     for s in sources:
-         url = s.get("url") or ""
-         if not url or url in seen: continue
          seen.add(url)
-         domain = domain_from_url(url)
-         s["domain"] = domain or ""
-         if not domain: continue
-         if domain in BLACKLISTED_DOMAINS: continue
-         if domain not in TRUSTED_DOMAINS: continue
-         if is_unwanted_snippet(s.get("snippet","")) or is_unwanted_snippet(s.get("title","")): continue
-         kept.append(s)
-     return kept
-
- # ---------------- Semantic filtering ----------------
- def compute_similarity(args):
-     claim_emb, snippet = args
-     model = get_sente_model()
-     if not model: return 0.0
-     snippet_emb = model.encode(snippet, convert_to_tensor=True)
-     from sentence_transformers import util
-     return util.cos_sim(claim_emb, snippet_emb).item()
-
- def semantic_filter_parallel(claim: str, sources: List[Dict[str,str]], threshold=0.3) -> List[Dict[str,str]]:
-     model = get_sente_model()
-     if not model or not sources:
-         return sources
-
-     claim_emb = model.encode(claim, convert_to_tensor=True)
-     args = [(claim_emb, s["snippet"]) for s in sources]
-
-     filtered = []
-     with ProcessPoolExecutor(max_workers=min(4, len(sources))) as executor:
-         sims = list(executor.map(compute_similarity, args))
-
-     for s, sim in zip(sources, sims):
-         if sim >= threshold:
-             filtered.append(s)
-     return filtered
-
- # ---------------- Evidence summary ----------------
- def summarize_evidence(sources: List[Dict[str,str]], max_chars=800) -> str:
-     if not sources:
-         return "No credible news sources found."
-     parts = []
-     for s in sources[:8]:
-         t = s.get("title") or ""
-         snip = s.get("snippet") or ""
-         domain = s.get("domain") or domain_from_url(s.get("url","")) or ""
-         parts.append(f"{t} ({domain}) — {snip}")
-     res = "\n".join(parts)
-     if len(res) > max_chars:
-         return res[:max_chars].rsplit(" ",1)[0] + "..."
-     return res

- # ---------------- Fusion ----------------
- def fuse_scores(fast_conf: float, deep_outcome: Optional[str], evidence_count: int) -> Dict[str,Any]:
-     base = fast_conf*0.5 + min(evidence_count/5.0,1.0)*0.5
-     if deep_outcome and deep_outcome.lower() in ["false","misleading"]:
-         base *= 0.7
-     score = int(round(max(0, min(1, base)) * 100))
-     color = "green" if score >= 70 else "yellow" if score >= 40 else "red"
-     return {"score":score, "color":color}
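# Worked example of fuse_scores (illustrative inputs):
#   fast_conf=0.6, evidence_count=3, deep_outcome="False"
#   base  = 0.6*0.5 + min(3/5, 1.0)*0.5 = 0.30 + 0.30 = 0.60
#   deep outcome is "false", so base *= 0.7 -> 0.42
#   score = 42 -> color "yellow" (>= 40 but < 70)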
-
- # ---------------- Fact Check API ----------------
  def factcheck_claim(claim: str) -> Dict[str,Any]:
-     api_key = "AIzaSyB0A-MIHs8qkjYTWE-TnoLw46KplX-Ihjs"
-     url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"
-     params = {"query": claim, "key": api_key, "languageCode": "en", "pageSize": 5}
      try:
-         r = requests.get(url, params=params, headers=safe_headers(), timeout=6)
          r.raise_for_status()
          js = r.json()
          claims = js.get("claims", [])
          results = []
          for c in claims:
-             claimant = c.get("claimant", "Unknown")
-             text = c.get("text", "")
-             claimReview = c.get("claimReview", [])
-             for review in claimReview:
-                 publisher = review.get("publisher", {}).get("name")
-                 url = review.get("url")
-                 title = review.get("title")
-                 review_rating = review.get("textualRating")
                  results.append({
-                     "claimant": claimant,
                      "text": text,
-                     "publisher": publisher,
-                     "url": url,
-                     "title": title,
-                     "rating": review_rating
                  })
-         outcome = "Unverified" if not results else results[0].get("rating", "Unverified")
          return {"outcome": outcome, "source": results}
      except Exception as e:
-         return {"outcome": "Error", "source": [], "error": str(e)}
-
- # ---------------- API ----------------
- @app.post("/verify")
- async def verify(req: VerifyRequest):
-     claim = (req.text or "").strip()
-     mode = (req.mode or "fast").lower()
-     if not claim:
-         raise HTTPException(status_code=400, detail="Empty claim")
-
-     # Step 1 classify
-     text_type_res = classify_text_type(claim)
-     stored_type = text_type_res["type"]
-
-     # Step 2 summarize
-     user_summary = summarize_text(claim)
-
-     # Step 3 search
-     query = f"{user_summary} site:bbc.com OR site:cnn.com OR site:reuters.com OR site:apnews.com"
-     all_raw = fetch_all_sources(query)

-     # Step 4 filter
-     filtered = filter_sources(all_raw)

-     # Step 4b semantic filter
-     filtered = semantic_filter_parallel(claim, filtered)

-     evidence_summary = summarize_evidence(filtered)

-     # Step 5 fast classification
-     fast_label, fast_conf = "Unverifiable", 0.4
      pipe = get_zs_pipe()
      if pipe:
          try:
-             cls = pipe(claim, ["True","False","Misleading","Unverifiable"], multi_label=False, truncation=True)
-             fast_label = cls["labels"][0]
-             fast_conf = float(cls["scores"][0])
-         except:
-             pass
-
-     # Step 6 deep (Gemini AI)
-     deep_result = None
-     if mode in ["deep","hybrid"]:
-         client = get_gemini_client()
-         if client:
-             try:
-                 prompt = f'Verify claim: "{claim}". Output JSON: outcome, explanation, comparison, takeaways.'
-                 resp = client.models.generate_content(model="gemini-2.5-flash", contents=prompt)
-                 deep_result = json.loads(resp.text)
-             except:
-                 deep_result = {"outcome":"Unverifiable","explanation":"Gemini API error","takeaways":["Check credible sources"]}
-         else:
-             deep_result = {"outcome":"Unverifiable","explanation":"Demo mode: API missing","takeaways":["Check credible sources"]}
-
-     # Step 7 fact-check API
-     factcheck = factcheck_claim(claim)
-
-     # Step 8 fuse scores
-     deep_outcome = deep_result.get("outcome") if deep_result else None
-     fuse = fuse_scores(fast_conf, deep_outcome, len(filtered))

-     return {
          "claim": claim,
-         "text_type": stored_type,
-         "text_type_scores": text_type_res.get("scores", {}),
-         "user_summary": user_summary,
-         "fast": {"label": fast_label, "confidence": round(fast_conf,3)},
-         "evidence_count_raw": len(all_raw),
-         "evidence_count_filtered": len(filtered),
-         "evidence": filtered,
-         "evidence_summary": evidence_summary,
-         "deep": deep_result or {},
-         "factcheck": factcheck,
-         "credibility": fuse
      }
-
- # ---------------- Frontend ----------------

  if __name__ == "__main__":
-     import uvicorn
-     uvicorn.run("app:app", host="0.0.0.0", port=int(os.getenv("PORT","1748")))
 
+ # misinfo_gradio_full.py
  import os
  import re
+ import time
+ import json
+ import base64
+ import logging
  from typing import List, Dict, Any, Optional

+ import requests
+ import trafilatura
+ import tldextract
+ import gradio as gr
+ from PIL import Image
+ import pytesseract

+ # ML lazy-load
  ZS_PIPE = None
+ SENTE = None
  GEMINI_CLIENT = None

+ # Load env
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ NEWSAPI_KEY = os.getenv("NEWSAPI_KEY")
+ GNEWS_KEY = os.getenv("GNEWS_KEY")
+ SERPAPI_KEY = os.getenv("SERPAPI_KEY")
+ FACTCHECK_KEY = os.getenv("FACTCHECK_KEY")
+ SAFE_BROWSING_KEY = os.getenv("SAFE_BROWSING_KEY")
+ VIRUSTOTAL_KEY = os.getenv("VIRUSTOTAL_KEY")
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+
+ # Logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger("misinfo")
+
+ # --- Helpers ---
+ def safe_headers():
+     return {"User-Agent": "misinfo-gradio/1.0"}
+
+ def extract_domain(url: str) -> Optional[str]:
+     try:
+         ext = tldextract.extract(url)
+         if ext.registered_domain:
+             return ext.registered_domain.lower()
+     except Exception:
+         pass
+     return None
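# registered_domain consults the public-suffix list, fixing the ccTLD problem
# of the old regex helper, e.g.:
#     tldextract.extract("https://www.bbc.co.uk/news").registered_domain     # "bbc.co.uk"
#     tldextract.extract("https://edition.cnn.com/world").registered_domain  # "cnn.com"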
+
+ TRUSTED_DOMAINS = {
+     "bbc.co.uk","bbc.com","cnn.com","nytimes.com","reuters.com","apnews.com",
+     "theguardian.com","npr.org","washingtonpost.com","wsj.com","usatoday.com",
+     "bloomberg.com","aljazeera.com","msnbc.com","cnbc.com","foxnews.com",
+     "scientificamerican.com","nature.com","sciencedaily.com","timesofindia.indiatimes.com","indiatimes.com"
+ }
+ BLACKLISTED_DOMAINS = {"example-bad-site.com"}  # keep small; replace with curated list in prod
+
+ # --- Model loaders ---
  def get_zs_pipe():
      global ZS_PIPE
      if ZS_PIPE is None:
          try:
              from transformers import pipeline
              ZS_PIPE = pipeline("zero-shot-classification", model="typeform/distilbert-base-uncased-mnli")
+         except Exception as e:
+             logger.warning("zero-shot pipeline load error: %s", e)
              ZS_PIPE = None
      return ZS_PIPE

  def get_sente_model():
+     global SENTE
+     if SENTE is None:
          try:
              from sentence_transformers import SentenceTransformer
+             SENTE = SentenceTransformer("all-MiniLM-L6-v2")
+         except Exception as e:
+             logger.warning("sentence-transformers load error: %s", e)
+             SENTE = None
+     return SENTE

  def get_gemini_client():
      global GEMINI_CLIENT
+     if GEMINI_CLIENT is None and GEMINI_API_KEY:
          try:
              from google import genai
+             GEMINI_CLIENT = genai.Client(api_key=GEMINI_API_KEY)
+         except Exception as e:
+             logger.warning("gemini client init error: %s", e)
              GEMINI_CLIENT = None
      return GEMINI_CLIENT

+ # --- Extraction ---
+ def fetch_and_extract(url: str, max_chars: int = 4000) -> str:
+     """Use trafilatura to fetch & extract main article text."""
+     if not url:
+         return ""
      try:
+         downloaded = trafilatura.fetch_url(url, headers=safe_headers(), timeout=12)
+         if not downloaded:
+             return ""
+         text = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
+         if not text:
+             return ""
+         text = re.sub(r'\s+', ' ', text).strip()
+         return text[:max_chars]
+     except Exception as e:
+         logger.warning("fetch_and_extract error: %s", e)
+         return ""
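# Caution: depending on the installed trafilatura version, fetch_url() may not
# accept headers/timeout keyword arguments (its timeout normally comes from
# trafilatura's config). A portable fallback, sketched here as an assumption,
# fetches with requests and passes the HTML to trafilatura.extract():
#
#     resp = requests.get(url, headers=safe_headers(), timeout=12)
#     resp.raise_for_status()
#     text = trafilatura.extract(resp.text, include_comments=False, include_tables=False)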

+ def ocr_image_to_text(img: Image.Image, max_chars=4000) -> str:
+     try:
+         text = pytesseract.image_to_string(img)
+         text = re.sub(r'\s+', ' ', text).strip()
+         return text[:max_chars]
+     except Exception as e:
+         logger.warning("OCR error: %s", e)
+         return ""
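# pytesseract only wraps the Tesseract binary, which must be installed on the
# host (e.g. `apt-get install tesseract-ocr`). If the binary is off PATH, point
# the wrapper at it explicitly (path below is illustrative):
#     pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"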

+ # --- News / evidence fetching ---
+ def fetch_newsapi(query: str, max_results: int = 6) -> List[Dict[str,str]]:
+     if not NEWSAPI_KEY:
          return []
      try:
+         url = "https://newsapi.org/v2/everything"
+         params = {"q": query, "pageSize": max_results, "apiKey": NEWSAPI_KEY, "language": "en", "sortBy": "relevancy"}
+         r = requests.get(url, params=params, headers=safe_headers(), timeout=8)
          r.raise_for_status()
          js = r.json()
+         articles = []
+         for a in js.get("articles", [])[:max_results]:
+             articles.append({"title": a.get("title"), "url": a.get("url"), "source": a.get("source",{}).get("name"), "snippet": a.get("description") or a.get("content") or ""})
+         return articles
+     except Exception as e:
+         logger.warning("NewsAPI error: %s", e)
          return []

+ def fetch_gnews(query: str, max_results: int = 6) -> List[Dict[str,str]]:
+     if not GNEWS_KEY:
          return []
      try:
+         url = "https://gnews.io/api/v4/search"
+         params = {"q": query, "token": GNEWS_KEY, "max": max_results, "lang": "en"}
+         r = requests.get(url, params=params, headers=safe_headers(), timeout=8)
          r.raise_for_status()
          js = r.json()
+         return [{"title": a.get("title"), "url": a.get("url"), "source": a.get("source",{}).get("name"), "snippet": a.get("description") or ""} for a in js.get("articles", [])[:max_results]]
+     except Exception as e:
+         logger.warning("GNews error: %s", e)
          return []

+ def fetch_serpapi(query: str, max_results: int = 6) -> List[Dict[str,str]]:
+     if not SERPAPI_KEY:
+         return []
      try:
+         url = "https://serpapi.com/search.json"
+         params = {"q": query, "api_key": SERPAPI_KEY, "num": max_results, "engine": "google"}
+         r = requests.get(url, params=params, headers=safe_headers(), timeout=8)
          r.raise_for_status()
+         js = r.json()
          results = []
+         for item in js.get("organic_results", [])[:max_results]:
+             results.append({"title": item.get("title"), "url": item.get("link"), "source": item.get("source") or item.get("displayed_link"), "snippet": item.get("snippet") or ""})
          return results
+     except Exception as e:
+         logger.warning("SerpApi error: %s", e)
          return []

+ def gather_news_evidence(query: str, max_results=6) -> List[Dict[str,str]]:
+     items = []
+     items.extend(fetch_newsapi(query, max_results))
+     items.extend(fetch_gnews(query, max_results))
+     items.extend(fetch_serpapi(query, max_results))
+     # dedupe by url
+     seen = set()
+     dedup = []
+     for it in items:
+         url = it.get("url")
+         if not url or url in seen:
+             continue
          seen.add(url)
+         dedup.append(it)
+     return dedup[:max_results]
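# The dedupe above keys on the exact URL string, so the same article reached
# via different tracking parameters (?utm_source=...) is kept twice. One
# possible refinement (an assumption, not part of this change) is to compare
# URLs with query and fragment stripped:
#
#     from urllib.parse import urlsplit
#     def url_key(u: str) -> str:
#         parts = urlsplit(u)
#         return f"{parts.scheme}://{parts.netloc}{parts.path}".rstrip("/")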

+ # --- Fact-check (Google Fact Check Tools) ---
  def factcheck_claim(claim: str) -> Dict[str,Any]:
+     if not FACTCHECK_KEY:
+         return {"outcome": "api_key_missing", "source": []}
      try:
+         url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"
+         params = {"query": claim, "key": FACTCHECK_KEY, "languageCode": "en", "pageSize": 5}
+         r = requests.get(url, params=params, headers=safe_headers(), timeout=8)
          r.raise_for_status()
          js = r.json()
          claims = js.get("claims", [])
          results = []
          for c in claims:
+             text = c.get("text")
+             for review in c.get("claimReview", []):
                  results.append({
+                     "claimant": c.get("claimant"),
                      "text": text,
+                     "publisher": review.get("publisher", {}).get("name"),
+                     "title": review.get("title"),
+                     "url": review.get("url"),
+                     "rating": review.get("textualRating")
                  })
+         outcome = "unverified" if not results else results[0].get("rating", "unverified")
          return {"outcome": outcome, "source": results}
      except Exception as e:
+         logger.warning("factcheck error: %s", e)
+         return {"outcome": "error", "error": str(e), "source": []}
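# textualRating is free text chosen by each fact-checking publisher ("False",
# "Pants on Fire!", "Mostly true", ...), so the raw outcome above is not
# directly comparable to the pipeline's True/False/Misleading labels. A
# minimal normalizer, offered as an assumption rather than part of the API:
#
#     def normalize_rating(rating: str) -> str:
#         r = (rating or "").lower()
#         if any(k in r for k in ("false", "pants on fire", "fake", "incorrect")):
#             return "False"
#         if any(k in r for k in ("misleading", "mixture", "partly", "half")):
#             return "Misleading"
#         if any(k in r for k in ("true", "correct", "accurate")):
#             return "True"
#         return "Unverifiable"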

+ # --- Safe Browsing (Google) ---
+ def check_safe_browsing(url: str) -> Dict[str,Any]:
+     if not SAFE_BROWSING_KEY:
+         return {"status": "api_key_missing"}
+     try:
+         endpoint = f"https://safebrowsing.googleapis.com/v4/threatMatches:find?key={SAFE_BROWSING_KEY}"
+         payload = {
+             "client": {"clientId": "misinfo-gradio", "clientVersion": "1.0"},
+             "threatInfo": {
+                 "threatTypes": ["MALWARE", "SOCIAL_ENGINEERING", "UNWANTED_SOFTWARE", "POTENTIALLY_HARMFUL_APPLICATION"],
+                 "platformTypes": ["ANY_PLATFORM"],
+                 "threatEntryTypes": ["URL"],
+                 "threatEntries": [{"url": url}]
+             }
+         }
+         r = requests.post(endpoint, json=payload, headers=safe_headers(), timeout=8)
+         r.raise_for_status()
+         js = r.json()
+         return {"status": "ok", "matches": js.get("matches", [])}
+     except Exception as e:
+         logger.warning("safe browsing error: %s", e)
+         return {"status": "error", "error": str(e)}

+ # --- VirusTotal check (best-effort) ---
+ def check_virustotal(url: str) -> Dict[str,Any]:
+     if not VIRUSTOTAL_KEY:
+         return {"status": "api_key_missing"}
+     try:
+         # Submit URL to /urls to get id
+         submit = requests.post("https://www.virustotal.com/api/v3/urls", data={"url": url}, headers={"x-apikey": VIRUSTOTAL_KEY}, timeout=10)
+         submit.raise_for_status()
+         data = submit.json()
+         url_id = data.get("data", {}).get("id")
+         if not url_id:
+             return {"status": "error", "error": "no_id"}
+         # Get analysis/summary (v3 has endpoints /urls/{id})
+         r = requests.get(f"https://www.virustotal.com/api/v3/urls/{url_id}", headers={"x-apikey": VIRUSTOTAL_KEY}, timeout=10)
+         r.raise_for_status()
+         info = r.json()
+         return {"status": "ok", "info": info}
+     except Exception as e:
+         logger.warning("virustotal error: %s", e)
+         return {"status": "error", "error": str(e)}
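# VirusTotal v3 note: POST /urls returns an *analysis* id (poll it via
# /analyses/{id}), while GET /urls/{id} expects the unpadded base64url
# encoding of the URL itself; that is what the otherwise-unused
# `import base64` at the top of this file is for. Direct-lookup sketch:
#
#     url_id = base64.urlsafe_b64encode(url.encode()).decode().strip("=")
#     r = requests.get(f"https://www.virustotal.com/api/v3/urls/{url_id}",
#                      headers={"x-apikey": VIRUSTOTAL_KEY}, timeout=10)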

+ # --- Semantic evidence selection ---
+ def select_relevant_sentences(claim: str, article_text: str, top_k: int = 5) -> List[str]:
+     model = get_sente_model()
+     if not model:
+         # fallback: return first sentences
+         sents = re.split(r'(?<=[.!?]) +', article_text)
+         return [s.strip() for s in sents[:top_k] if s.strip()]
+     # split into sentences and compute similarity
+     sentences = [s.strip() for s in re.split(r'(?<=[.!?]) +', article_text) if s.strip()]
+     if not sentences:
+         return []
+     try:
+         claim_emb = model.encode(claim, convert_to_tensor=True)
+         sent_embs = model.encode(sentences, convert_to_tensor=True)
+         import numpy as np
+         from sentence_transformers import util
+         sims = util.cos_sim(claim_emb, sent_embs)[0].cpu().numpy()
+         idxs = list(np.argsort(-sims)[:top_k])
+         selected = [sentences[i] for i in idxs if i < len(sentences)]
+         return selected
+     except Exception as e:
+         logger.warning("semantic selection error: %s", e)
+         # fallback
+         return sentences[:top_k]
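# Shape check for the block above: util.cos_sim(claim_emb, sent_embs) is a
# (1, n_sentences) tensor, so [0] yields one score per sentence, and
# np.argsort(-sims)[:top_k] takes the top_k indices by descending similarity.
# Note the selected sentences come back ranked by similarity, not in their
# original article order.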

+ # --- Zero-shot classification (truth + content type) ---
+ def zero_shot_classify(text: str) -> Dict[str,Any]:
      pipe = get_zs_pipe()
+     res = {}
      if pipe:
          try:
+             truth_labels = ["True", "False", "Misleading", "Unverifiable"]
+             r1 = pipe(text, truth_labels, multi_label=False, truncation=True)
+             res["truth_label"] = r1["labels"][0]
+             res["truth_score"] = float(r1["scores"][0])
+         except Exception as e:
+             logger.warning("zero-shot truth error: %s", e)
+             res["truth_label"] = "Unknown"; res["truth_score"] = 0.0
+         try:
+             type_labels = ["News","Opinion","Satire","Rumor"]
+             r2 = pipe(text, type_labels, multi_label=False, truncation=True)
+             res["content_type"] = r2["labels"][0]
+             res["content_type_score"] = float(r2["scores"][0])
+         except Exception as e:
+             logger.warning("zero-shot content type error: %s", e)
+             res["content_type"] = "Unknown"; res["content_type_score"] = 0.0
+     else:
+         res = {"truth_label":"Unknown","truth_score":0.0,"content_type":"Unknown","content_type_score":0.0}
+     return res
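# Caveat: an MNLI zero-shot classifier scores how well a hypothesis like
# "This example is True." is entailed by the text, so bare True/False labels
# are a weak proxy for factual accuracy. The pipeline's hypothesis_template
# kwarg lets the intent be phrased explicitly (wording is an assumption):
#
#     pipe(text, truth_labels, multi_label=False, truncation=True,
#          hypothesis_template="The factual status of this statement is {}.")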

+ # --- Gemini deep verification ---
+ def gemini_verify(claim: str, evidence: List[str], domain: Optional[str]) -> Dict[str,Any]:
+     client = get_gemini_client()
+     if not client:
+         return {"outcome": "api_missing", "explanation": "Gemini API key not set or client failed", "raw": None}
+     # structured prompt asking for JSON
+     prompt = (
+         "You are an expert fact-checker. Given the claim and evidence, output valid JSON with keys:\n"
+         "outcome (one of: True, False, Misleading, Unverifiable),\n"
+         "confidence (0-1),\n"
+         "explanation (short),\n"
+         "takeaways (list of 1-3 short tips),\n"
+         "sources (list of cited sources if any).\n\n"
+         f"Claim: {claim}\n\n"
+         f"Domain: {domain}\n\n"
+         "Evidence:\n" + ("\n".join(f"- {e}" for e in evidence)) + "\n\n"
+         "Provide only JSON in the response."
+     )
+     try:
+         resp = client.models.generate_content(model="gemini-2.5-flash", contents=prompt)
+         text = resp.text
+         # attempt to parse JSON substring
+         try:
+             parsed = json.loads(text)
+             return {"outcome":"ok", "result": parsed, "raw": text}
+         except Exception:
+             # try to find first { ... } substring
+             m = re.search(r'(\{.*\})', text, flags=re.S)
+             if m:
+                 try:
+                     parsed = json.loads(m.group(1))
+                     return {"outcome":"ok", "result": parsed, "raw": text}
+                 except Exception:
+                     return {"outcome":"parse_error", "raw": text}
+             return {"outcome":"no_json", "raw": text}
+     except Exception as e:
+         logger.warning("gemini error: %s", e)
+         return {"outcome":"error", "error": str(e)}
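# The substring-regex fallback above guards against models wrapping JSON in
# prose or code fences. Assuming a current google-genai SDK, JSON output can
# also be requested directly, which makes that fallback rarely needed:
#
#     from google.genai import types
#     resp = client.models.generate_content(
#         model="gemini-2.5-flash", contents=prompt,
#         config=types.GenerateContentConfig(response_mime_type="application/json"))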
+
+ # --- Fusion of signals into credibility score ---
+ def fuse_signals(truth_score: float, domain: Optional[str], evidence_count: int, gemini_outcome: Optional[Dict[str,Any]]) -> Dict[str,Any]:
+     # base from truth_score (0-1)
+     base = truth_score
+     # domain trust
+     domain_factor = 1.0
+     if domain:
+         if domain in TRUSTED_DOMAINS:
+             domain_factor += 0.2
+         elif domain in BLACKLISTED_DOMAINS:
+             domain_factor -= 0.4
+         else:
+             domain_factor += 0.0
+     # evidence factor (cap to 1)
+     evidence_factor = min(evidence_count / 5.0, 1.0)
+     # gemini adjustment
+     gemini_adj = 1.0
+     if gemini_outcome and gemini_outcome.get("result"):
+         res = gemini_outcome["result"]
+         out = res.get("outcome", "").lower()
+         conf = float(res.get("confidence", 0.5)) if isinstance(res.get("confidence", 0.5), (float,int,str)) else 0.5
+         if out in ("false","misleading"):
+             gemini_adj -= 0.25 * conf
+         elif out == "true":
+             gemini_adj += 0.1 * conf
+         elif out == "unverifiable":
+             gemini_adj -= 0.05 * conf
+     # combine
+     score = base * 0.5 + evidence_factor * 0.3 + (domain_factor - 1.0) * 0.2
+     score = score * gemini_adj
+     score = max(0.0, min(1.0, score))
+     pct = int(round(score * 100))
+     color = "green" if pct >= 70 else "yellow" if pct >= 40 else "red"
+     return {"score": pct, "color": color, "raw": score}
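# Worked example of the fusion (illustrative inputs):
#   truth_score=0.8, domain in TRUSTED_DOMAINS, evidence_count=4,
#   Gemini result: outcome="True", confidence=0.9
#   score = 0.8*0.5 + min(4/5,1.0)*0.3 + (1.2-1.0)*0.2 = 0.40 + 0.24 + 0.04 = 0.68
#   gemini_adj = 1.0 + 0.1*0.9 = 1.09; score = 0.68*1.09 = 0.7412
#   pct = 74 -> "green"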
+
+ # --- Main pipeline: single mode (run everything) ---
+ def analyze_pipeline(article: Optional[str], url: Optional[str], image: Optional[Image.Image], claim_override: Optional[str], top_k_evidence: int = 5):
+     # 1) choose text source
+     source = None
+     text = ""
+     domain = None
+     if article and article.strip():
+         source = "article"
+         text = article.strip()
+     elif url and url.strip():
+         source = "url"
+         domain = extract_domain(url)
+         text = fetch_and_extract(url) or ""
+     elif image is not None:
+         source = "image"
+         text = ocr_image_to_text(image) or ""
+     else:
+         return {"error": "No input provided. Paste article text, enter a URL, or upload an image."}
+
+     # limit text
+     if len(text) > 4000:
+         text = text[:4000]
+
+     # claim to check: use explicit claim_override, else fall back to the first sentence/headline
+     claim = claim_override.strip() if claim_override and claim_override.strip() else (re.split(r'(?<=[.!?]) +', text.strip())[0] if text else "")
+
+     # 2) quick zero-shot classification
+     zs = zero_shot_classify(text if len(claim) < 30 else claim)  # classify the full text when the claim is very short, otherwise the claim
+     truth_label = zs.get("truth_label")
+     truth_score = zs.get("truth_score", 0.0)
+     content_type = zs.get("content_type")
+     content_type_score = zs.get("content_type_score", 0.0)
+
+     # 3) evidence: internal (from article) and external (news APIs)
+     internal_evidence = select_relevant_sentences(claim or text, text, top_k=top_k_evidence) if text else []
+     # external news queries: search using claim or summary
+     query = claim or (text[:200])
+     external_articles = gather_news_evidence(query, max_results=6)
+     # filter to credible domains
+     ext_filtered = []
+     for a in external_articles:
+         dom = extract_domain(a.get("url") or "")
+         a["domain"] = dom
+         if dom and dom in TRUSTED_DOMAINS:
+             ext_filtered.append(a)
+
+     # 4) fact-check API
+     fact = factcheck_claim(claim or text)
+
+     # 5) safe browsing + virustotal only if URL input provided
+     safe_browsing_res = check_safe_browsing(url) if url else {"status": "no_url"}
+     virustotal_res = check_virustotal(url) if url else {"status": "no_url"}
+
+     # 6) deep verify with Gemini (claim + internal + external evidence)
+     evidence_for_gemini = internal_evidence[:top_k_evidence] + [(a.get("title") or "") + " - " + (a.get("snippet") or "") for a in ext_filtered[:top_k_evidence]]
+     gemini_res = gemini_verify(claim or text, evidence_for_gemini, domain)
+
+     # 7) fuse signals
+     credibility = fuse_signals(truth_score, domain, len(internal_evidence) + len(ext_filtered), gemini_res)
+
+     # 8) build outputs & tips
+     tips = (
+         "- Check the source domain and author.\n"
+         "- Cross-check the claim with multiple trusted outlets.\n"
+         "- Look for official statements or peer-reviewed studies for scientific claims.\n"
+         "- Be skeptical of sensational language and images without context."
+     )
+
+     out = {
+         "source": source,
+         "domain": domain,
          "claim": claim,
+         "text_snippet": text[:800],
+         "quick_classification": {"truth_label": truth_label, "truth_score": truth_score, "content_type": content_type, "content_type_score": content_type_score},
+         "internal_evidence": internal_evidence,
+         "external_evidence": ext_filtered,
+         "factcheck": fact,
+         "safe_browsing": safe_browsing_res,
+         "virustotal": {"status": virustotal_res.get("status", "unknown"), "summary": (virustotal_res.get("info") or {}) if isinstance(virustotal_res, dict) else {}},
+         "gemini_verification": gemini_res,
+         "credibility": credibility,
+         "tips": tips
      }
+     return out
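# Minimal usage sketch (no UI; input values are illustrative):
#
#     result = analyze_pipeline(
#         article="NASA confirmed the detection of water ice at the lunar south pole. ...",
#         url=None, image=None, claim_override=None, top_k_evidence=5)
#     print(result["credibility"], result["quick_classification"]["truth_label"])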
+
+ # --- Gradio UI ---
+ def pretty_output(result: Dict[str,Any]):
+     if not isinstance(result, dict):
+         return str(result), "", "", "", ""
+     if result.get("error"):
+         return result["error"], "", "", "", ""
+     # format sections
+     header = f"Credibility score: {result['credibility']['score']} ({result['credibility']['color']})"
+     quick = json.dumps(result.get("quick_classification", {}), indent=2)
+     evidence = ""
+     if result.get("internal_evidence"):
+         evidence += "Internal evidence (from article):\n" + "\n".join(f"- {s}" for s in result["internal_evidence"]) + "\n\n"
+     if result.get("external_evidence"):
+         evidence += "External corroborating articles:\n" + "\n".join(f"- {a.get('title')} ({a.get('domain')}) — {a.get('url')}" for a in result["external_evidence"]) + "\n\n"
+     fact = json.dumps(result.get("factcheck", {}), indent=2)
+     gemini = result.get("gemini_verification", {})
+     gemini_text = json.dumps(gemini, indent=2) if gemini else ""
+     tips = result.get("tips", "")
+     return header, quick, evidence, fact, gemini_text + "\n\n" + tips
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# 🛡️ Unified Misinformation Detector (single mode)")
+     gr.Markdown("Provide article text (preferred), a URL, or an uploaded screenshot image. Optionally add a short claim to check.")
+
+     with gr.Row():
+         article_in = gr.Textbox(lines=6, label="Paste Article Text (preferred)")
+         url_in = gr.Textbox(label="Article URL")
+         image_in = gr.Image(type="pil", label="Upload Image (screenshot)")
+
+     claim_in = gr.Textbox(lines=1, label="Optional short claim (overrides automatic claim extraction)")
+     topk = gr.Slider(1, 8, value=5, step=1, label="Top-K evidence sentences")
+
+     run_btn = gr.Button("Run Full Pipeline")
+     out_header = gr.Textbox(label="Summary", interactive=False)
+     out_quick = gr.Code(label="Quick classification (truth + content type)")
+     out_evidence = gr.Textbox(label="Evidence & External articles", lines=12)
+     out_factcheck = gr.Code(label="Fact-check API result")
+     out_gemini = gr.Code(label="Gemini result + Tips")
+
+     def run(article, url, image, claim_override, top_k):
+         res = analyze_pipeline(article, url, image, claim_override, top_k_evidence=int(top_k))
+         return pretty_output(res)
+
+     run_btn.click(run, inputs=[article_in, url_in, image_in, claim_in, topk], outputs=[out_header, out_quick, out_evidence, out_factcheck, out_gemini])

  if __name__ == "__main__":
+     demo.launch()
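# For hosted deployments that inject a port (as the removed uvicorn entry
# point did via PORT), Gradio's launch() accepts explicit bind settings:
#
#     demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))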