Spaces:

Pragthedon
/

proofly

Running

File size: 14,238 Bytes

# ==========================================
# IMPORTS
# ==========================================
import os
import requests
import faiss
import numpy as np
import urllib.parse
from bs4 import BeautifulSoup
import feedparser
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Import shared config and database layer
from project.config import (
    FAISS_FILE, NEWS_API_KEY,
    USER_AGENT, SENTENCE_TRANSFORMER_MODEL, NLI_MODEL as NLI_MODEL_NAME
)
from project.database import init_db, clear_db, save_evidence, load_all_evidence
from knowledge_base import KNOWLEDGE_BASE

# ==========================================
# MODEL LOADING
# ==========================================
embed_model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
nli_model = pipeline(
    "text-classification",
    model=NLI_MODEL_NAME
)

# ==========================================
# RELEVANCE CHECK
# ==========================================
def is_relevant(claim_emb, text, threshold=0.15):
    """Encodes text and checks similarity against claim. 
    Returns (bool_is_relevant, embedding_as_list)."""
    emb = embed_model.encode([text], normalize_embeddings=True)
    sim = float(np.dot(claim_emb, emb[0]))
    print(f"[DEBUG] Checking relevance for: '{text[:50]}...' Score: {sim:.4f}")
    return sim >= threshold, emb[0].tolist()

def get_search_query(claim):
    stop_words = set(["is", "am", "are", "was", "were", "be", "been", "being",
                      "the", "a", "an", "and", "but", "or", "on", "in", "with", "of", "to", "for",
                      "he", "she", "it", "they", "we", "i", "you", "that", "this", "these", "those",
                      "have", "has", "had", "do", "does", "did", "not", "no", "yes", "from"])
    words = [w for w in claim.split() if w.lower() not in stop_words]
    # Return top words to form a potent query (e.g. "modi president india")
    return " ".join(words[:5])

# ==========================================
# RSS FETCH
# ==========================================
def fetch_rss(claim_emb):
    print("[RSS] Fetching...")
    feeds = [
        "http://feeds.bbci.co.uk/news/rss.xml",
        "http://rss.cnn.com/rss/edition.rss",
        "https://www.aljazeera.com/xml/rss/all.xml",
        "https://www.theguardian.com/world/rss",
        "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
        "https://timesofindia.indiatimes.com/rss.cms",
        "https://www.hindustantimes.com/feeds/rss/topstories.rss",
        "https://cfo.economictimes.indiatimes.com/rss",
        "https://www.business-standard.com/rss/",
        "https://www.thehindu.com/news/national/feeder/default.rss",
        "https://indianexpress.com/section/india/feed/",
        "https://feeds.feedburner.com/ndtvnews-top-stories"
    ]
    count = 0
    for url in feeds:
        try:
            feed = feedparser.parse(url)
            print(f"[RSS] Parsed {url}, found {len(feed.entries)} entries")
            for entry in feed.entries[:5]:
                title = entry.title
                if title:
                    relevant, emb = is_relevant(claim_emb, title)
                    if relevant:
                        save_evidence(title, "RSS", embedding=emb)
                        count += 1
        except Exception as e:
            print(f"[RSS] Error parsing {url}: {e}")
    print(f"[RSS] Saved {count} items.")

# ==========================================
# GDELT FETCH
# ==========================================
def fetch_gdelt(claim, claim_emb):
    print("[GDELT] Fetching...")
    search_query = get_search_query(claim)
    url = "https://api.gdeltproject.org/api/v2/doc/doc"
    params = {
        "query": search_query,
        "mode": "ArtList",
        "format": "json",
        "maxrecords": 5
    }

    added = 0
    try:
        r = requests.get(url, params=params, timeout=10)
        r.raise_for_status()
        data = r.json()
        articles = data.get("articles", [])
        print(f"[GDELT] Found {len(articles)} articles")

        for art in articles:
            title = art.get("title", "")
            if title:
                relevant, emb = is_relevant(claim_emb, title)
                if relevant:
                    save_evidence(title, "GDELT", embedding=emb)
                    added += 1
    except Exception as e:
        print("[WARNING] GDELT failed:", e)

    print(f"[GDELT] Saved {added} items.")
    return added

# ==========================================
# NEWS API FETCH
# ==========================================
def fetch_newsapi(claim, claim_emb):
    print("[NewsAPI] Fetching...")

    if not NEWS_API_KEY:
        print("[WARNING] NEWS_API_KEY is not set in .env — skipping NewsAPI.")
        return 0

    url = "https://newsapi.org/v2/everything"
    search_query = get_search_query(claim)
    params = {
        "q": search_query,
        "apiKey": NEWS_API_KEY,
        "language": "en",
        "sortBy": "relevancy",
        "pageSize": 5
    }

    added = 0
    try:
        r = requests.get(url, params=params, timeout=10)
        data = r.json()

        if r.status_code != 200:
            print(f"[WARNING] NewsAPI Error: {data.get('message', 'Unknown error')}")
            return 0

        articles = data.get("articles", [])
        print(f"[NewsAPI] Found {len(articles)} articles")

        for art in articles:
            title = art.get("title", "")
            description = art.get("description", "") or ""
            content = f"{title}. {description}".strip(". ")

            if content:
                relevant, emb = is_relevant(claim_emb, content, threshold=0.05)
                if relevant:
                    save_evidence(content, f"NewsAPI: {art.get('source', {}).get('name', 'Unknown')}", embedding=emb)
                    added += 1
    except Exception as e:
        print("[WARNING] NewsAPI failed:", e)

    print(f"[NewsAPI] Saved {added} items.")
    return added

# ==========================================
# WIKIPEDIA (REST API)
# ==========================================
def fetch_wikipedia(claim):
    print("[Wikipedia] Fetching...")
    search_query = get_search_query(claim)
    try:
        query = urllib.parse.quote(search_query)
        url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={query}&format=json"
        headers = {"User-Agent": USER_AGENT}
        r = requests.get(url, headers=headers, timeout=10)
        data = r.json()

        results = data.get("query", {}).get("search", [])
        print(f"[Wikipedia] Found {len(results)} search results")

        saved = 0
        for result in results[:3]:
            title = result["title"]
            page_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{urllib.parse.quote(title)}"
            r2 = requests.get(page_url, headers=headers, timeout=5)
            if r2.status_code == 200:
                extract = r2.json().get("extract", "")
                if len(extract) > 20:
                    claim_emb_wiki = embed_model.encode([claim], normalize_embeddings=True)
                    relevant, emb = is_relevant(claim_emb_wiki[0], extract, threshold=0.05)
                    if relevant:
                        save_evidence(extract, f"Wikipedia: {title}", embedding=emb)
                        saved += 1
        print(f"[Wikipedia] Saved {saved} items.")

    except Exception as e:
        print("[WARNING] Wikipedia failed:", e)

# ==========================================
# STATIC KNOWLEDGE BASE
# ==========================================
def fetch_knowledge_base(claim, claim_emb, threshold=0.30):
    """Query the curated static knowledge base using embedding similarity.
    This is called first so timeless facts always get reliable evidence."""
    print("[KnowledgeBase] Querying static knowledge base...")
    saved = 0
    for entry in KNOWLEDGE_BASE:
        text = entry["text"]
        source = entry["source"]
        emb_text = embed_model.encode([text], normalize_embeddings=True)
        sim = float(np.dot(claim_emb, emb_text[0]))
        if sim >= threshold:
            save_evidence(text, source, embedding=emb_text[0].tolist())
            saved += 1
    print(f"[KnowledgeBase] Saved {saved} matching entries (threshold={threshold}).")
    return saved

# ==========================================
# WIKIDATA ENTITY SEARCH
# ==========================================
def fetch_wikidata(claim, claim_emb, threshold=0.10):
    """Fetch entity summaries from Wikidata's free public API.
    No API key required. Good for factual entity-level claims."""
    print("[Wikidata] Fetching...")
    search_query = get_search_query(claim)
    try:
        url = "https://www.wikidata.org/w/api.php"
        params = {
            "action": "wbsearchentities",
            "search": search_query,
            "language": "en",
            "format": "json",
            "limit": 5,
            "type": "item"
        }
        headers = {"User-Agent": USER_AGENT}
        r = requests.get(url, params=params, headers=headers, timeout=8)
        r.raise_for_status()
        data = r.json()
        results = data.get("search", [])
        print(f"[Wikidata] Found {len(results)} entities")

        saved = 0
        for item in results:
            description = item.get("description", "")
            label = item.get("label", "")
            if description and label:
                text = f"{label}: {description}"
                relevant, emb = is_relevant(claim_emb, text, threshold=threshold)
                if relevant:
                    save_evidence(text, "Wikidata", embedding=emb)
                    saved += 1
        print(f"[Wikidata] Saved {saved} items.")
        return saved
    except Exception as e:
        print(f"[WARNING] Wikidata failed: {e}")
        return 0

# ==========================================
# DUCKDUCKGO FALLBACK
# ==========================================
def fetch_duckduckgo(claim, claim_emb):
    print("[Fallback] DuckDuckGo activated...")
    search_query = get_search_query(claim)
    try:
        query = urllib.parse.quote(search_query)
        url = f"https://duckduckgo.com/html/?q={query}"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        r = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(r.text, "html.parser")

        results = soup.find_all("a", class_="result__a", limit=5)
        print(f"[DuckDuckGo] Found {len(results)} results")

        saved = 0
        for res in results:
            text = res.get_text()
            if len(text) > 30:
                relevant, emb = is_relevant(claim_emb, text, 0.05)
                if relevant:
                    save_evidence(text, "DuckDuckGo", embedding=emb)
                    saved += 1
        print(f"[DuckDuckGo] Saved {saved} items")
    except Exception as e:
        print("[WARNING] DuckDuckGo failed:", e)

# ==========================================
# BUILD FAISS
# ==========================================
def build_faiss():
    """Loads pre-calculated embeddings from Database and builds index.
    No re-encoding performed here — drastically reduces RAM peaks."""
    rows = load_all_evidence()
    if not rows:
        return False

    # Filter rows that actually have embeddings
    texts = []
    embeddings_list = []
    for row in rows:
        if row[3]: # row[3] is the embedding
            texts.append(row[1])
            embeddings_list.append(row[3])
    
    if not embeddings_list:
        return False

    embeddings = np.array(embeddings_list).astype('float32')
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)

    faiss.write_index(index, FAISS_FILE)
    return True

# ==========================================
# MAIN PIPELINE (CLI / standalone use)
# ==========================================
def run_fact_check(claim):
    print("\n[FACT CHECK]", claim)

    init_db()
    clear_db()

    claim_emb = embed_model.encode([claim], normalize_embeddings=True)

    # Fetch from all sources (now includes NewsAPI, consistent with api_wrapper)
    fetch_rss(claim_emb)
    gdelt_count = fetch_gdelt(claim, claim_emb)
    newsapi_count = fetch_newsapi(claim, claim_emb)
    fetch_wikipedia(claim)

    from project.database import get_total_evidence_count
    total_count = get_total_evidence_count()

    activate_fallback = (gdelt_count + newsapi_count) == 0 or total_count < 3

    if build_faiss():
        if os.path.exists(FAISS_FILE):
            index = faiss.read_index(FAISS_FILE)
            D, _ = index.search(claim_emb, 1)
            if len(D) > 0 and len(D[0]) > 0:
                similarity = D[0][0]
                if similarity < 0.50:
                    activate_fallback = True

    if activate_fallback:
        fetch_duckduckgo(claim, claim_emb)
        build_faiss()

    if not os.path.exists(FAISS_FILE):
        print("[ERROR] No evidence found.")
        return

    index = faiss.read_index(FAISS_FILE)
    D, I = index.search(claim_emb, 5)
    rows = load_all_evidence()

    print("\n[EVIDENCE]")
    for idx in I[0]:
        if idx < len(rows):
            print("-", rows[idx][1][:200])

    print("\n[NLI RESULTS]")
    for idx in I[0]:
        if idx < len(rows):
            evidence_text = rows[idx][1]
            candidate_labels = [
                f"Supports the claim: {claim}",
                f"Contradicts the claim: {claim}",
                f"Is unrelated to the claim: {claim}"
            ]
            result = nli_model(evidence_text, candidate_labels=candidate_labels)
            
            if result and 'labels' in result:
                top_label = result['labels'][0]
                top_score = result['scores'][0]
                print(f"[{top_label}] (Score: {top_score:.2f})")
            else:
                print(result)

# ==========================================
# RUN
# ==========================================
if __name__ == "__main__":
    claim = input("Enter claim: ").strip()
    if claim:
        run_fact_check(claim)
    else:
        print("Claim cannot be empty.")