Spaces:
Running
Running
| # ========================================== | |
| # API WRAPPER FOR FLASK | |
| # ========================================== | |
| from project.database import get_total_evidence_count, load_all_evidence | |
def run_fact_check_api(claim):
    """
    API-friendly fact checker that returns structured data instead of printing.

    Pipeline: offline knowledge base -> Wikidata -> cached vector DB ->
    live fetches (RSS / GDELT / NewsAPI / Wikipedia) -> DuckDuckGo fallback ->
    FAISS retrieval -> NLI stance classification -> similarity-weighted verdict.

    Args:
        claim: Natural-language claim to verify.

    Returns:
        dict with keys ``success``, ``claim``, ``verdict``, ``confidence``,
        ``evidence`` (list of {text, source, similarity}), ``nli_results``
        (list of {evidence, label, score, similarity}) and ``total_evidence``;
        on failure ``success`` is False and an ``error`` message is included.

    Note:
        If the heavy model dependencies are missing, a canned demo payload
        with the same schema is returned so the API stays usable (see the
        ImportError handler at the bottom).
    """
    try:
        # Imported lazily so a missing dependency degrades to demo mode
        # instead of crashing the whole app at import time.
        from model import (
            init_db, clear_db, embed_model, fetch_rss, fetch_gdelt, fetch_newsapi,
            fetch_wikipedia, fetch_duckduckgo, fetch_knowledge_base, fetch_wikidata,
            build_faiss, load_all_evidence, nli_model, FAISS_FILE
        )
        import faiss

        init_db()
        # clear_db() intentionally NOT called so facts accumulate across runs.
        claim_emb = embed_model.encode([claim], normalize_embeddings=True)

        # 1. Static knowledge base (offline, always runs first).
        kb_count = fetch_knowledge_base(claim, claim_emb)

        # -- Quick KB short-circuit -------------------------------------------
        # If the KB already found matches, build a temporary FAISS index and
        # check the best similarity score. If it is high (>= 0.65) we have
        # enough reliable evidence -> skip the slow live fetches entirely.
        kb_short_circuit = False
        if kb_count >= 1:
            if build_faiss():
                _idx = faiss.read_index(FAISS_FILE)
                _D, _ = _idx.search(claim_emb, 1)
                if len(_D[0]) > 0 and _D[0][0] >= 0.65:
                    kb_short_circuit = True
                    print(f"[KnowledgeBase] Strong match (score={_D[0][0]:.2f}) -> skipping live fetches.")
        # ---------------------------------------------------------------------

        # 2. Wikidata entity search (fast, no API key -> always runs).
        fetch_wikidata(claim, claim_emb)

        # -- Database evidence search (vector cache) --------------------------
        # Before slow live scraping, check whether the database already holds
        # highly relevant evidence from previous fact-checks of similar topics.
        local_evidence_found = False
        if not kb_short_circuit and build_faiss():
            _idx = faiss.read_index(FAISS_FILE)
            if _idx.ntotal > 0:
                _D, _ = _idx.search(claim_emb, 1)
                if len(_D[0]) > 0 and _D[0][0] >= 0.60:
                    local_evidence_found = True
                    print(f"[VectorCache] Strong local evidence found (score={_D[0][0]:.2f}) -> skipping live scrapes.")
        # ---------------------------------------------------------------------

        # 3. Live fetches -- skipped when KB or local DB already matched.
        live_fetches_ran = not kb_short_circuit and not local_evidence_found
        gdelt_count = 0
        newsapi_count = 0
        if live_fetches_ran:
            fetch_rss(claim_emb)
            gdelt_count = fetch_gdelt(claim, claim_emb)
            newsapi_count = fetch_newsapi(claim, claim_emb)
            fetch_wikipedia(claim)

        total_count = get_total_evidence_count()

        # DuckDuckGo fallback. BUGFIX: the zero-count test is only meaningful
        # when the live fetches actually ran -- previously a KB/cache
        # short-circuit left both counts at 0, which *always* activated the
        # fallback and defeated the "skip live fetches" optimisation.
        activate_fallback = False
        if live_fetches_ran and ((gdelt_count + newsapi_count) == 0 or total_count < 3):
            activate_fallback = True

        faiss_ready = build_faiss()
        if faiss_ready:
            index = faiss.read_index(FAISS_FILE)
            D, _ = index.search(claim_emb, 1)
            # A low best-similarity means nothing collected so far is on-topic,
            # so the fallback search is still worthwhile.
            if len(D) > 0 and len(D[0]) > 0 and D[0][0] < 0.50:
                activate_fallback = True

        if activate_fallback:
            fetch_duckduckgo(claim, claim_emb)
            faiss_ready = build_faiss()

        if not faiss_ready:
            return {
                "success": False,
                "error": "No relevant evidence found.",
                "evidence": [],
                "nli_results": []
            }

        index = faiss.read_index(FAISS_FILE)
        # Search wider first (10 items), then de-duplicate down to at most 5.
        top_k = min(10, index.ntotal)
        D, I = index.search(claim_emb, top_k)
        rows = load_all_evidence()

        # De-duplicate by text prefix and apply a minimum similarity threshold.
        seen_texts = set()
        unique_indices = []
        unique_scores = []
        for sim_score, row_idx in zip(D[0], I[0]):
            # BUGFIX: FAISS pads missing results with -1; guard both ends so
            # a padded hit cannot alias rows[-1].
            if row_idx < 0 or row_idx >= len(rows):
                continue
            txt = rows[row_idx][1][:100]  # key on the first 100 chars
            if txt not in seen_texts and sim_score >= 0.50:
                seen_texts.add(txt)
                unique_indices.append(row_idx)
                unique_scores.append(sim_score)
            if len(unique_indices) >= 5:
                break

        # rows[idx] contains (id, text, source, embedding_json).
        evidence_list = [
            {
                "text": rows[idx][1],
                "source": rows[idx][2],
                "similarity": float(unique_scores[i]),
            }
            for i, idx in enumerate(unique_indices)
        ]

        # Hoisted out of the evidence loop (it was re-defined, with its regex
        # re-built, on every iteration) -- behavior is unchanged.
        import re
        _qualifier_re = re.compile(
            r'\s+(in\s+\d{4}|since\s+\w+|currently|right now|nowadays|as of \w+)$',
            flags=re.IGNORECASE,
        )

        def get_core_claim(c):
            """Strip trailing temporal qualifiers like 'in 2024', 'currently'
            that confuse literal NLI matching -- but NOT location qualifiers
            that are part of the claim's meaning (e.g. 'at sea level')."""
            stripped = _qualifier_re.sub('', c.strip())
            return stripped if stripped != c else c

        # Build NLI results (keep similarity so the verdict can be weighted).
        nli_results = []
        for i, idx in enumerate(unique_indices):
            evidence_text = rows[idx][1]
            sim_weight = float(unique_scores[i])  # FAISS cosine similarity
            try:
                # Run NLI with the raw claim -- always the primary result.
                r1 = nli_model(evidence_text, text_pair=claim)
                label1 = r1[0].get("label", "neutral")
                score1 = float(r1[0].get("score", 0.0))
                # Only try the simplified core claim if the raw result is
                # neutral (prevents stripping from flipping a correct
                # entailment into a contradiction).
                if label1 == "neutral":
                    core = get_core_claim(claim)
                    if core != claim:
                        r2 = nli_model(evidence_text, text_pair=core)
                        label2 = r2[0].get("label", "neutral")
                        score2 = float(r2[0].get("score", 0.0))
                        if label2 != "neutral" and score2 > score1:
                            label1, score1 = label2, score2
                nli_results.append({
                    "evidence": evidence_text[:200],
                    "label": label1,
                    "score": score1,
                    "similarity": sim_weight
                })
            except Exception as e:
                # Best-effort: a single failed NLI call should not abort the run.
                print(f"[WARNING] NLI error: {e}")

        # -- Similarity-weighted verdict --------------------------------------
        # Uses the strongest single piece of evidence so high-quality sources
        # cannot be outvoted by a larger quantity of noisy ones.
        verdict = "Uncertain"
        confidence = 0.0
        if nli_results:
            best_entail = max(
                [r['score'] * r['similarity'] for r in nli_results if 'entail' in r['label'].lower()] + [0.0]
            )
            best_contra = max(
                [r['score'] * r['similarity'] for r in nli_results if 'contradict' in r['label'].lower()] + [0.0]
            )
            print(f"[Verdict] best entail={best_entail:.3f} contra={best_contra:.3f}")
            if best_entail > best_contra and best_entail >= 0.20:
                verdict = "True"
                confidence = best_entail
            elif best_contra > best_entail and best_contra >= 0.20:
                verdict = "False"
                confidence = best_contra
            else:
                verdict = "Mixture/Uncertain"
                confidence = max(best_entail, best_contra)

        return {
            "success": True,
            "claim": claim,
            "verdict": verdict,
            "confidence": round(confidence, 2),
            "evidence": evidence_list,
            "nli_results": nli_results,
            "total_evidence": len(evidence_list)
        }
    except ImportError as e:
        print(f"DEBUG: ImportError in api_wrapper: {e}")
        # Dependencies missing -> return canned demo data so the API stays up.
        # CONSISTENCY FIX: include "verdict"/"confidence" keys so consumers
        # see the same schema as the full pipeline.
        return {
            "success": True,
            "claim": claim,
            "verdict": "Uncertain",
            "confidence": 0.0,
            "evidence": [
                {
                    "text": "This is demo evidence from RSS feed. Install dependencies from requirements.txt for real fact-checking.",
                    "source": "RSS",
                    "similarity": 0.85
                },
                {
                    "text": "This is demo evidence from GDELT. The full system searches multiple news sources and databases.",
                    "source": "GDELT",
                    "similarity": 0.78
                },
                {
                    "text": "This is demo evidence from Wikipedia. Install all dependencies to enable real-time fact verification.",
                    "source": "Wikipedia",
                    "similarity": 0.72
                }
            ],
            "nli_results": [
                {
                    "evidence": "Demo evidence showing entailment (supports the claim)",
                    "label": "entailment",
                    "score": 0.89
                },
                {
                    "evidence": "Demo evidence showing neutral stance",
                    "label": "neutral",
                    "score": 0.65
                },
                {
                    "evidence": "Demo evidence showing contradiction",
                    "label": "contradiction",
                    "score": 0.45
                }
            ],
            "total_evidence": 3
        }
    except Exception as e:
        print(f"DEBUG: General Exception in api_wrapper: {e}")
        import traceback
        traceback.print_exc()
        return {
            "success": False,
            "error": str(e),
            "evidence": [],
            "nli_results": []
        }