File size: 11,071 Bytes
e6d7e29
 
 
4f48a4e
e6d7e29
 
 
 
 
 
 
 
 
 
 
 
 
4f48a4e
 
e6d7e29
 
 
 
 
4f48a4e
e6d7e29
 
 
4f48a4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6d7e29
 
4f48a4e
e6d7e29
 
 
 
 
 
 
 
 
 
 
4f48a4e
e6d7e29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f48a4e
 
 
e6d7e29
 
 
4f48a4e
 
 
 
 
 
 
 
 
 
 
 
 
 
e6d7e29
4f48a4e
 
 
 
 
 
 
 
 
e6d7e29
4f48a4e
e6d7e29
4f48a4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6d7e29
4f48a4e
 
 
 
e6d7e29
4f48a4e
 
 
e6d7e29
4f48a4e
 
 
 
e6d7e29
 
 
4f48a4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6d7e29
4f48a4e
 
e6d7e29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
# ==========================================
# API WRAPPER FOR FLASK
# ==========================================
from project.database import get_total_evidence_count, load_all_evidence

def run_fact_check_api(claim):
    """
    API-friendly fact-check entry point: returns structured data instead
    of printing results.

    Args:
        claim: Natural-language claim to verify.

    Returns:
        On success, a dict with keys:
            success (True), claim, verdict ("True"/"False"/"Mixture/Uncertain"
            or "Uncertain"), confidence (float, rounded to 2 dp),
            evidence (list of {text, source, similarity}),
            nli_results (list of {evidence, label, score, similarity}),
            total_evidence (int).
        If the heavy model dependencies are missing (ImportError), a canned
        demo payload with success=True is returned instead.
        On any other failure: {"success": False, "error": ..., "evidence": [],
        "nli_results": []}.

    Note: This is a simplified version for demo. For full functionality,
    install all dependencies from requirements.txt
    """
    try:
        # Heavy deps are imported lazily so the ImportError branch below can
        # serve demo data when they are not installed.
        from model import (
            init_db, clear_db, embed_model, fetch_rss, fetch_gdelt, fetch_newsapi,
            fetch_wikipedia, fetch_duckduckgo, fetch_knowledge_base, fetch_wikidata,
            build_faiss, load_all_evidence, nli_model, FAISS_FILE
        )
        import faiss

        # Full implementation
        init_db()
        # clear_db() intentionally not called so facts accumulate across runs.

        claim_emb = embed_model.encode([claim], normalize_embeddings=True)

        # 1. Static knowledge base (offline, always runs first)
        kb_count = fetch_knowledge_base(claim, claim_emb)

        # ── Quick KB short-circuit ──────────────────────────────────────
        # If KB already found strong matches, build a temporary FAISS and
        # check the best similarity score. If it's high (≥ 0.65) we have
        # enough reliable evidence — skip the slow live fetches entirely.
        kb_short_circuit = False
        if kb_count >= 1 and build_faiss():
            _idx = faiss.read_index(FAISS_FILE)
            _D, _ = _idx.search(claim_emb, 1)
            if len(_D[0]) > 0 and _D[0][0] >= 0.65:
                kb_short_circuit = True
                print(f"[KnowledgeBase] Strong match (score={_D[0][0]:.2f}) — skipping live fetches.")
        # ───────────────────────────────────────────────────────────────

        # 2. Wikidata entity search (fast, no API key — always runs)
        fetch_wikidata(claim, claim_emb)

        # ── Database Evidence Search (Vector Cache) ───────────────────
        # Before doing slow live scraping, check if our database already has
        # highly relevant evidence from previous fact-checks of similar topics.
        local_evidence_found = False
        if not kb_short_circuit and build_faiss():
            _idx = faiss.read_index(FAISS_FILE)
            if _idx.ntotal > 0:
                _D, _ = _idx.search(claim_emb, 1)
                if len(_D[0]) > 0 and _D[0][0] >= 0.60:
                    local_evidence_found = True
                    print(f"[VectorCache] Strong local evidence found (score={_D[0][0]:.2f}) — skipping live scrapes.")
        # ───────────────────────────────────────────────────────────────

        # 3. Live fetches — skipped when KB or local DB already has strong matches
        skip_live = kb_short_circuit or local_evidence_found
        gdelt_count = 0
        newsapi_count = 0
        if not skip_live:
            fetch_rss(claim_emb)
            gdelt_count = fetch_gdelt(claim, claim_emb)
            newsapi_count = fetch_newsapi(claim, claim_emb)
            fetch_wikipedia(claim)

        # Count evidence
        total_count = get_total_evidence_count()

        # DuckDuckGo fallback — only when the live fetches actually ran and
        # came back thin. BUGFIX: previously the count test also fired when
        # the fetches were skipped by a short-circuit (counts were 0 then),
        # which re-introduced a live scrape and defeated the short-circuit.
        activate_fallback = False
        if not skip_live and ((gdelt_count + newsapi_count) == 0 or total_count < 3):
            activate_fallback = True

        faiss_ready = build_faiss()

        # Second fallback trigger: evidence exists but nothing is similar
        # enough to the claim. Short-circuit paths scored ≥ 0.60, so they
        # are unaffected by this check.
        if faiss_ready:
            index = faiss.read_index(FAISS_FILE)
            D, _ = index.search(claim_emb, 1)
            if len(D) > 0 and len(D[0]) > 0 and D[0][0] < 0.50:
                activate_fallback = True

        if activate_fallback:
            fetch_duckduckgo(claim, claim_emb)
            faiss_ready = build_faiss()

        if not faiss_ready:
            return {
                "success": False,
                "error": "No relevant evidence found.",
                "evidence": [],
                "nli_results": []
            }

        index = faiss.read_index(FAISS_FILE)
        # Search wider first (10 items), then de-duplicate
        top_k = min(10, index.ntotal)
        D, I = index.search(claim_emb, top_k)

        rows = load_all_evidence()

        # De-duplicate by text content and apply minimum similarity threshold
        seen_texts = set()
        unique_indices = []
        unique_scores = []
        for sim_score, row_idx in zip(D[0], I[0]):
            # FAISS pads missing results with -1; guard both ends so we
            # never index rows[-1] by accident.
            if row_idx < 0 or row_idx >= len(rows):
                continue
            txt = rows[row_idx][1][:100]  # key by first 100 chars
            if txt not in seen_texts and sim_score >= 0.50:
                seen_texts.add(txt)
                unique_indices.append(row_idx)
                unique_scores.append(sim_score)
            if len(unique_indices) >= 5:
                break

        evidence_list = []
        for i, idx in enumerate(unique_indices):
            # rows[idx] contains (id, text, source, embedding_json)
            evidence_list.append({
                "text": rows[idx][1],
                "source": rows[idx][2],
                "similarity": float(unique_scores[i])
            })

        import re

        def get_core_claim(c):
            """Strip trailing prepositional qualifiers like 'in 2024', 'currently'
            that confuse literal NLI matching — but NOT location qualifiers that
            are part of the claim's meaning (e.g. 'at sea level')."""
            stripped = re.sub(
                r'\s+(in\s+\d{4}|since\s+\w+|currently|right now|nowadays|as of \w+)$',
                '', c.strip(), flags=re.IGNORECASE
            )
            return stripped if stripped != c else c

        # Build NLI results (track similarity index for weighted voting).
        # NOTE: get_core_claim is hoisted above — it was previously redefined
        # (and `re` re-imported) on every loop iteration.
        nli_results = []
        for i, idx in enumerate(unique_indices):
            evidence_text = rows[idx][1]
            sim_weight = float(unique_scores[i])   # FAISS cosine similarity
            try:
                # Run NLI with the raw claim — this is always the primary result
                r1 = nli_model(evidence_text, text_pair=claim)
                label1 = r1[0].get("label", "neutral")
                score1 = float(r1[0].get("score", 0.0))

                # Only try the simplified core-claim if the raw result is neutral
                # (prevents stripping from flipping a correct entailment to contradiction)
                if label1 == "neutral":
                    core = get_core_claim(claim)
                    if core != claim:
                        r2 = nli_model(evidence_text, text_pair=core)
                        label2 = r2[0].get("label", "neutral")
                        score2 = float(r2[0].get("score", 0.0))
                        if label2 != "neutral" and score2 > score1:
                            label1, score1 = label2, score2

                nli_results.append({
                    "evidence":   evidence_text[:200],
                    "label":      label1,
                    "score":      score1,
                    "similarity": sim_weight
                })
            except Exception as e:
                # Best-effort: a single NLI failure should not abort the run.
                print(f"[WARNING] NLI error: {e}")

        # ── Similarity-Weighted Verdict ───────────────────────────────────────
        # Uses the strongest evidence to avoid high-quality sources being
        # outvoted by a higher quantity of lower-quality noisy sources.
        verdict    = "Uncertain"
        confidence = 0.0

        if nli_results:
            best_entail = max(
                ([r['score'] * r['similarity'] for r in nli_results if 'entail' in r['label'].lower()] + [0.0])
            )
            best_contra = max(
                ([r['score'] * r['similarity'] for r in nli_results if 'contradict' in r['label'].lower()] + [0.0])
            )

            print(f"[Verdict] best entail={best_entail:.3f}  contra={best_contra:.3f}")

            if best_entail > best_contra and best_entail >= 0.20:
                verdict    = "True"
                confidence = best_entail
            elif best_contra > best_entail and best_contra >= 0.20:
                verdict    = "False"
                confidence = best_contra
            else:
                verdict    = "Mixture/Uncertain"
                confidence = max(best_entail, best_contra)

        return {
            "success": True,
            "claim": claim,
            "verdict": verdict,
            "confidence": round(confidence, 2),
            "evidence": evidence_list,
            "nli_results": nli_results,
            "total_evidence": len(evidence_list)
        }

    except ImportError as e:
        print(f"DEBUG: ImportError in api_wrapper: {e}")
        # Return demo data if dependencies are missing
        return {
            "success": True,
            "claim": claim,
            "evidence": [
                {
                    "text": "This is demo evidence from RSS feed. Install dependencies from requirements.txt for real fact-checking.",
                    "source": "RSS",
                    "similarity": 0.85
                },
                {
                    "text": "This is demo evidence from GDELT. The full system searches multiple news sources and databases.",
                    "source": "GDELT",
                    "similarity": 0.78
                },
                {
                    "text": "This is demo evidence from Wikipedia. Install all dependencies to enable real-time fact verification.",
                    "source": "Wikipedia",
                    "similarity": 0.72
                }
            ],
            "nli_results": [
                {
                    "evidence": "Demo evidence showing entailment (supports the claim)",
                    "label": "entailment",
                    "score": 0.89
                },
                {
                    "evidence": "Demo evidence showing neutral stance",
                    "label": "neutral",
                    "score": 0.65
                },
                {
                    "evidence": "Demo evidence showing contradiction",
                    "label": "contradiction",
                    "score": 0.45
                }
            ],
            "total_evidence": 3
        }

    except Exception as e:
        print(f"DEBUG: General Exception in api_wrapper: {e}")
        import traceback
        traceback.print_exc()
        return {
            "success": False,
            "error": str(e),
            "evidence": [],
            "nli_results": []
        }