File size: 11,071 Bytes
e6d7e29
 
 
4f48a4e
e6d7e29
 
 
 
 
 
 
 
 
 
 
 
 
4f48a4e
 
e6d7e29
 
 
 
 
4f48a4e
e6d7e29
 
 
4f48a4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6d7e29
 
4f48a4e
e6d7e29
 
 
 
 
 
 
 
 
 
 
4f48a4e
e6d7e29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f48a4e
 
 
e6d7e29
 
 
4f48a4e
 
 
 
 
 
 
 
 
 
 
 
 
 
e6d7e29
4f48a4e
 
 
 
 
 
 
 
 
e6d7e29
4f48a4e
e6d7e29
4f48a4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6d7e29
4f48a4e
 
 
 
e6d7e29
4f48a4e
 
 
e6d7e29
4f48a4e
 
 
 
e6d7e29
 
 
4f48a4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6d7e29
4f48a4e
 
e6d7e29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
# ==========================================
# API WRAPPER FOR FLASK
# ==========================================
from project.database import get_total_evidence_count, load_all_evidence

def run_fact_check_api(claim):
    """
    API-friendly fact-check entry point: returns structured data instead
    of printing results.

    Args:
        claim: Natural-language claim to verify.

    Returns:
        On success, a dict with keys:
            success (True), claim, verdict ("True"/"False"/"Mixture/Uncertain"
            or "Uncertain"), confidence (float, rounded to 2 dp),
            evidence (list of {text, source, similarity}),
            nli_results (list of {evidence, label, score, similarity}),
            total_evidence (int).
        If the heavy model dependencies are missing (ImportError), a canned
        demo payload with success=True is returned instead.
        On any other failure: {"success": False, "error": ..., "evidence": [],
        "nli_results": []}.

    Note: This is a simplified version for demo. For full functionality,
    install all dependencies from requirements.txt
    """
    try:
        # Heavy deps are imported lazily so the ImportError branch below can
        # serve demo data when they are not installed.
        from model import (
            init_db, clear_db, embed_model, fetch_rss, fetch_gdelt, fetch_newsapi,
            fetch_wikipedia, fetch_duckduckgo, fetch_knowledge_base, fetch_wikidata,
            build_faiss, load_all_evidence, nli_model, FAISS_FILE
        )
        import faiss

        # Full implementation
        init_db()
        # clear_db() intentionally not called so facts accumulate across runs.

        claim_emb = embed_model.encode([claim], normalize_embeddings=True)

        # 1. Static knowledge base (offline, always runs first)
        kb_count = fetch_knowledge_base(claim, claim_emb)

        # ── Quick KB short-circuit ──────────────────────────────────────
        # If KB already found strong matches, build a temporary FAISS and
        # check the best similarity score. If it's high (≥ 0.65) we have
        # enough reliable evidence — skip the slow live fetches entirely.
        kb_short_circuit = False
        if kb_count >= 1 and build_faiss():
            _idx = faiss.read_index(FAISS_FILE)
            _D, _ = _idx.search(claim_emb, 1)
            if len(_D[0]) > 0 and _D[0][0] >= 0.65:
                kb_short_circuit = True
                print(f"[KnowledgeBase] Strong match (score={_D[0][0]:.2f}) — skipping live fetches.")
        # ───────────────────────────────────────────────────────────────

        # 2. Wikidata entity search (fast, no API key — always runs)
        fetch_wikidata(claim, claim_emb)

        # ── Database Evidence Search (Vector Cache) ───────────────────
        # Before doing slow live scraping, check if our database already has
        # highly relevant evidence from previous fact-checks of similar topics.
        local_evidence_found = False
        if not kb_short_circuit and build_faiss():
            _idx = faiss.read_index(FAISS_FILE)
            if _idx.ntotal > 0:
                _D, _ = _idx.search(claim_emb, 1)
                if len(_D[0]) > 0 and _D[0][0] >= 0.60:
                    local_evidence_found = True
                    print(f"[VectorCache] Strong local evidence found (score={_D[0][0]:.2f}) — skipping live scrapes.")
        # ───────────────────────────────────────────────────────────────

        # 3. Live fetches — skipped when KB or local DB already has strong matches
        skip_live = kb_short_circuit or local_evidence_found
        gdelt_count = 0
        newsapi_count = 0
        if not skip_live:
            fetch_rss(claim_emb)
            gdelt_count = fetch_gdelt(claim, claim_emb)
            newsapi_count = fetch_newsapi(claim, claim_emb)
            fetch_wikipedia(claim)

        # Count evidence
        total_count = get_total_evidence_count()

        # DuckDuckGo fallback — only when the live fetches actually ran and
        # came back thin. BUGFIX: previously the count test also fired when
        # the fetches were skipped by a short-circuit (counts were 0 then),
        # which re-introduced a live scrape and defeated the short-circuit.
        activate_fallback = False
        if not skip_live and ((gdelt_count + newsapi_count) == 0 or total_count < 3):
            activate_fallback = True

        faiss_ready = build_faiss()

        # Second fallback trigger: evidence exists but nothing is similar
        # enough to the claim. Short-circuit paths scored ≥ 0.60, so they
        # are unaffected by this check.
        if faiss_ready:
            index = faiss.read_index(FAISS_FILE)
            D, _ = index.search(claim_emb, 1)
            if len(D) > 0 and len(D[0]) > 0 and D[0][0] < 0.50:
                activate_fallback = True

        if activate_fallback:
            fetch_duckduckgo(claim, claim_emb)
            faiss_ready = build_faiss()

        if not faiss_ready:
            return {
                "success": False,
                "error": "No relevant evidence found.",
                "evidence": [],
                "nli_results": []
            }

        index = faiss.read_index(FAISS_FILE)
        # Search wider first (10 items), then de-duplicate
        top_k = min(10, index.ntotal)
        D, I = index.search(claim_emb, top_k)

        rows = load_all_evidence()

        # De-duplicate by text content and apply minimum similarity threshold
        seen_texts = set()
        unique_indices = []
        unique_scores = []
        for sim_score, row_idx in zip(D[0], I[0]):
            # FAISS pads missing results with -1; guard both ends so we
            # never index rows[-1] by accident.
            if row_idx < 0 or row_idx >= len(rows):
                continue
            txt = rows[row_idx][1][:100]  # key by first 100 chars
            if txt not in seen_texts and sim_score >= 0.50:
                seen_texts.add(txt)
                unique_indices.append(row_idx)
                unique_scores.append(sim_score)
            if len(unique_indices) >= 5:
                break

        evidence_list = []
        for i, idx in enumerate(unique_indices):
            # rows[idx] contains (id, text, source, embedding_json)
            evidence_list.append({
                "text": rows[idx][1],
                "source": rows[idx][2],
                "similarity": float(unique_scores[i])
            })

        import re

        def get_core_claim(c):
            """Strip trailing prepositional qualifiers like 'in 2024', 'currently'
            that confuse literal NLI matching — but NOT location qualifiers that
            are part of the claim's meaning (e.g. 'at sea level')."""
            stripped = re.sub(
                r'\s+(in\s+\d{4}|since\s+\w+|currently|right now|nowadays|as of \w+)$',
                '', c.strip(), flags=re.IGNORECASE
            )
            return stripped if stripped != c else c

        # Build NLI results (track similarity index for weighted voting).
        # NOTE: get_core_claim is hoisted above — it was previously redefined
        # (and `re` re-imported) on every loop iteration.
        nli_results = []
        for i, idx in enumerate(unique_indices):
            evidence_text = rows[idx][1]
            sim_weight = float(unique_scores[i])   # FAISS cosine similarity
            try:
                # Run NLI with the raw claim — this is always the primary result
                r1 = nli_model(evidence_text, text_pair=claim)
                label1 = r1[0].get("label", "neutral")
                score1 = float(r1[0].get("score", 0.0))

                # Only try the simplified core-claim if the raw result is neutral
                # (prevents stripping from flipping a correct entailment to contradiction)
                if label1 == "neutral":
                    core = get_core_claim(claim)
                    if core != claim:
                        r2 = nli_model(evidence_text, text_pair=core)
                        label2 = r2[0].get("label", "neutral")
                        score2 = float(r2[0].get("score", 0.0))
                        if label2 != "neutral" and score2 > score1:
                            label1, score1 = label2, score2

                nli_results.append({
                    "evidence":   evidence_text[:200],
                    "label":      label1,
                    "score":      score1,
                    "similarity": sim_weight
                })
            except Exception as e:
                # Best-effort: a single NLI failure should not abort the run.
                print(f"[WARNING] NLI error: {e}")

        # ── Similarity-Weighted Verdict ───────────────────────────────────────
        # Uses the strongest evidence to avoid high-quality sources being
        # outvoted by a higher quantity of lower-quality noisy sources.
        verdict    = "Uncertain"
        confidence = 0.0

        if nli_results:
            best_entail = max(
                ([r['score'] * r['similarity'] for r in nli_results if 'entail' in r['label'].lower()] + [0.0])
            )
            best_contra = max(
                ([r['score'] * r['similarity'] for r in nli_results if 'contradict' in r['label'].lower()] + [0.0])
            )

            print(f"[Verdict] best entail={best_entail:.3f}  contra={best_contra:.3f}")

            if best_entail > best_contra and best_entail >= 0.20:
                verdict    = "True"
                confidence = best_entail
            elif best_contra > best_entail and best_contra >= 0.20:
                verdict    = "False"
                confidence = best_contra
            else:
                verdict    = "Mixture/Uncertain"
                confidence = max(best_entail, best_contra)

        return {
            "success": True,
            "claim": claim,
            "verdict": verdict,
            "confidence": round(confidence, 2),
            "evidence": evidence_list,
            "nli_results": nli_results,
            "total_evidence": len(evidence_list)
        }

    except ImportError as e:
        print(f"DEBUG: ImportError in api_wrapper: {e}")
        # Return demo data if dependencies are missing
        return {
            "success": True,
            "claim": claim,
            "evidence": [
                {
                    "text": "This is demo evidence from RSS feed. Install dependencies from requirements.txt for real fact-checking.",
                    "source": "RSS",
                    "similarity": 0.85
                },
                {
                    "text": "This is demo evidence from GDELT. The full system searches multiple news sources and databases.",
                    "source": "GDELT",
                    "similarity": 0.78
                },
                {
                    "text": "This is demo evidence from Wikipedia. Install all dependencies to enable real-time fact verification.",
                    "source": "Wikipedia",
                    "similarity": 0.72
                }
            ],
            "nli_results": [
                {
                    "evidence": "Demo evidence showing entailment (supports the claim)",
                    "label": "entailment",
                    "score": 0.89
                },
                {
                    "evidence": "Demo evidence showing neutral stance",
                    "label": "neutral",
                    "score": 0.65
                },
                {
                    "evidence": "Demo evidence showing contradiction",
                    "label": "contradiction",
                    "score": 0.45
                }
            ],
            "total_evidence": 3
        }

    except Exception as e:
        print(f"DEBUG: General Exception in api_wrapper: {e}")
        import traceback
        traceback.print_exc()
        return {
            "success": False,
            "error": str(e),
            "evidence": [],
            "nli_results": []
        }