# ==========================================
# API WRAPPER FOR FLASK
# ==========================================
from project.database import get_total_evidence_count, load_all_evidence


def run_fact_check_api(claim):
    """
    API-friendly fact-check pipeline that returns structured data instead
    of printing.

    Pipeline (when all dependencies are installed):
      1. Embed the claim and query the static knowledge base.
      2. Short-circuit on a strong KB match (FAISS cosine >= 0.65) or a
         strong local vector-cache match (>= 0.60) to skip slow live fetches.
      3. Otherwise pull live evidence (RSS, GDELT, NewsAPI, Wikipedia),
         with a DuckDuckGo fallback when coverage is thin.
      4. Retrieve top FAISS hits, de-duplicate, run NLI per evidence item,
         and compute a similarity-weighted verdict.

    Parameters:
        claim (str): The natural-language claim to verify.

    Returns:
        dict: On success: {"success": True, "claim", "verdict", "confidence",
              "evidence": [...], "nli_results": [...], "total_evidence"}.
              When optional dependencies are missing, a canned demo payload
              is returned instead (still "success": True).
              On failure: {"success": False, "error", "evidence": [],
              "nli_results": []}.

    Note: This is a simplified version for demo. For full functionality,
    install all dependencies from requirements.txt.
    """
    try:
        # Heavy/optional dependencies are imported lazily so the module can
        # still be loaded (and serve demo data) when they are missing.
        from model import (
            init_db, clear_db, embed_model, fetch_rss, fetch_gdelt,
            fetch_newsapi, fetch_wikipedia, fetch_duckduckgo,
            fetch_knowledge_base, fetch_wikidata, build_faiss,
            load_all_evidence, nli_model, FAISS_FILE
        )
        import faiss
        import re

        # Full implementation
        init_db()
        # clear_db() - Removed to allow accumulation of facts
        claim_emb = embed_model.encode([claim], normalize_embeddings=True)

        # 1. Static knowledge base (offline, always runs first)
        kb_count = fetch_knowledge_base(claim, claim_emb)

        # ── Quick KB short-circuit ──────────────────────────────────────
        # If KB already found strong matches, build a temporary FAISS and
        # check the best similarity score. If it's high (≥ 0.65) we have
        # enough reliable evidence — skip the slow live fetches entirely.
        kb_short_circuit = False
        if kb_count >= 1:
            if build_faiss():
                _idx = faiss.read_index(FAISS_FILE)
                _D, _ = _idx.search(claim_emb, 1)
                if len(_D[0]) > 0 and _D[0][0] >= 0.65:
                    kb_short_circuit = True
                    print(f"[KnowledgeBase] Strong match (score={_D[0][0]:.2f}) — skipping live fetches.")
        # ───────────────────────────────────────────────────────────────

        # 2. Wikidata entity search (fast, no API key — always runs)
        fetch_wikidata(claim, claim_emb)

        # ── Database Evidence Search (Vector Cache) ─────────────────────
        # Before doing slow live scraping, check if our database already has
        # highly relevant evidence from previous fact-checks of similar topics.
        local_evidence_found = False
        if not kb_short_circuit and build_faiss():
            _idx = faiss.read_index(FAISS_FILE)
            if _idx.ntotal > 0:
                _D, _ = _idx.search(claim_emb, 1)
                if len(_D[0]) > 0 and _D[0][0] >= 0.60:
                    local_evidence_found = True
                    print(f"[VectorCache] Strong local evidence found (score={_D[0][0]:.2f}) — skipping live scrapes.")
        # ───────────────────────────────────────────────────────────────

        # 3. Live fetches — skipped when KB or local DB already has strong matches
        gdelt_count = 0
        newsapi_count = 0
        if not kb_short_circuit and not local_evidence_found:
            fetch_rss(claim_emb)
            gdelt_count = fetch_gdelt(claim, claim_emb)
            newsapi_count = fetch_newsapi(claim, claim_emb)
            fetch_wikipedia(claim)

        # Count evidence; fall back to DuckDuckGo when the live sources
        # produced nothing or the DB is still nearly empty.
        total_count = get_total_evidence_count()
        activate_fallback = False
        if (gdelt_count + newsapi_count) == 0 or total_count < 3:
            activate_fallback = True

        faiss_ready = build_faiss()
        if faiss_ready:
            index = faiss.read_index(FAISS_FILE)
            D, _ = index.search(claim_emb, 1)
            # Even with evidence present, a weak best match (< 0.50) means
            # nothing relevant — trigger the fallback search anyway.
            if len(D) > 0 and len(D[0]) > 0 and D[0][0] < 0.50:
                activate_fallback = True

        if activate_fallback:
            fetch_duckduckgo(claim, claim_emb)
            faiss_ready = build_faiss()

        if not faiss_ready:
            return {
                "success": False,
                "error": "No relevant evidence found.",
                "evidence": [],
                "nli_results": []
            }

        index = faiss.read_index(FAISS_FILE)

        # Search wider first (10 items), then de-duplicate
        top_k = min(10, index.ntotal)
        D, I = index.search(claim_emb, top_k)
        rows = load_all_evidence()

        # De-duplicate by text content and apply minimum similarity threshold
        seen_texts = set()
        unique_indices = []
        unique_scores = []
        for sim_score, row_idx in zip(D[0], I[0]):
            # FAISS reports missing neighbors as -1; guard both ends so we
            # never silently index rows[-1] (fix: previously only the upper
            # bound was checked).
            if row_idx < 0 or row_idx >= len(rows):
                continue
            txt = rows[row_idx][1][:100]  # key by first 100 chars
            if txt not in seen_texts and sim_score >= 0.50:
                seen_texts.add(txt)
                unique_indices.append(row_idx)
                unique_scores.append(sim_score)
            if len(unique_indices) >= 5:
                break

        evidence_list = []
        for i, idx in enumerate(unique_indices):
            # rows[idx] contains (id, text, source, embedding_json)
            evidence_list.append({
                "text": rows[idx][1],
                "source": rows[idx][2],
                "similarity": float(unique_scores[i])
            })

        def get_core_claim(c):
            """Strip trailing prepositional qualifiers like 'in 2024',
            'currently' that confuse literal NLI matching — but NOT location
            qualifiers that are part of the claim's meaning (e.g. 'at sea
            level')."""
            stripped = re.sub(
                r'\s+(in\s+\d{4}|since\s+\w+|currently|right now|nowadays|as of \w+)$',
                '',
                c.strip(),
                flags=re.IGNORECASE
            )
            return stripped if stripped != c else c

        # Build NLI results (track similarity index for weighted voting).
        # NOTE: get_core_claim is defined once above (fix: it used to be
        # re-defined, and `re` re-imported, on every loop iteration).
        nli_results = []
        for i, idx in enumerate(unique_indices):
            evidence_text = rows[idx][1]
            sim_weight = float(unique_scores[i])  # FAISS cosine similarity
            try:
                # Run NLI with the raw claim — this is always the primary result
                r1 = nli_model(evidence_text, text_pair=claim)
                label1 = r1[0].get("label", "neutral")
                score1 = float(r1[0].get("score", 0.0))

                # Only try the simplified core-claim if the raw result is neutral
                # (prevents stripping from flipping a correct entailment to contradiction)
                if label1 == "neutral":
                    core = get_core_claim(claim)
                    if core != claim:
                        r2 = nli_model(evidence_text, text_pair=core)
                        label2 = r2[0].get("label", "neutral")
                        score2 = float(r2[0].get("score", 0.0))
                        if label2 != "neutral" and score2 > score1:
                            label1, score1 = label2, score2

                nli_results.append({
                    "evidence": evidence_text[:200],
                    "label": label1,
                    "score": score1,
                    "similarity": sim_weight
                })
            except Exception as e:
                # Best-effort: a single NLI failure should not abort the
                # whole fact-check; skip this evidence item.
                print(f"[WARNING] NLI error: {e}")

        # ── Similarity-Weighted Verdict ─────────────────────────────────
        # Uses the strongest evidence to avoid high-quality sources being
        # outvoted by a higher quantity of lower-quality noisy sources.
        verdict = "Uncertain"
        confidence = 0.0
        if nli_results:
            best_entail = max(
                ([r['score'] * r['similarity'] for r in nli_results
                  if 'entail' in r['label'].lower()] + [0.0])
            )
            best_contra = max(
                ([r['score'] * r['similarity'] for r in nli_results
                  if 'contradict' in r['label'].lower()] + [0.0])
            )
            print(f"[Verdict] best entail={best_entail:.3f} contra={best_contra:.3f}")
            if best_entail > best_contra and best_entail >= 0.20:
                verdict = "True"
                confidence = best_entail
            elif best_contra > best_entail and best_contra >= 0.20:
                verdict = "False"
                confidence = best_contra
            else:
                verdict = "Mixture/Uncertain"
                confidence = max(best_entail, best_contra)

        return {
            "success": True,
            "claim": claim,
            "verdict": verdict,
            "confidence": round(confidence, 2),
            "evidence": evidence_list,
            "nli_results": nli_results,
            "total_evidence": len(evidence_list)
        }

    except ImportError as e:
        print(f"DEBUG: ImportError in api_wrapper: {e}")
        # Return demo data if dependencies are missing
        return {
            "success": True,
            "claim": claim,
            "evidence": [
                {
                    "text": "This is demo evidence from RSS feed. Install dependencies from requirements.txt for real fact-checking.",
                    "source": "RSS",
                    "similarity": 0.85
                },
                {
                    "text": "This is demo evidence from GDELT. The full system searches multiple news sources and databases.",
                    "source": "GDELT",
                    "similarity": 0.78
                },
                {
                    "text": "This is demo evidence from Wikipedia. Install all dependencies to enable real-time fact verification.",
                    "source": "Wikipedia",
                    "similarity": 0.72
                }
            ],
            "nli_results": [
                {
                    "evidence": "Demo evidence showing entailment (supports the claim)",
                    "label": "entailment",
                    "score": 0.89
                },
                {
                    "evidence": "Demo evidence showing neutral stance",
                    "label": "neutral",
                    "score": 0.65
                },
                {
                    "evidence": "Demo evidence showing contradiction",
                    "label": "contradiction",
                    "score": 0.45
                }
            ],
            "total_evidence": 3
        }
    except Exception as e:
        # Top-level boundary: log the full traceback and return a structured
        # error payload so the Flask layer never sees a raised exception.
        print(f"DEBUG: General Exception in api_wrapper: {e}")
        import traceback
        traceback.print_exc()
        return {
            "success": False,
            "error": str(e),
            "evidence": [],
            "nli_results": []
        }