adi-123 committed on
Commit
a3075d5
·
verified ·
1 Parent(s): 45547cc

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +885 -0
utils.py ADDED
@@ -0,0 +1,885 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ from typing import List, Dict, Any
4
+ from dotenv import load_dotenv
5
+ from langchain.schema import Document as LangchainDocument
6
+ from langchain_community.vectorstores import FAISS
7
+ from langchain_together.chat_models import ChatTogether
8
+ from langchain_together.embeddings import TogetherEmbeddings
9
+ import spacy
10
+ import pandas as pd
11
+ import json
12
+ import re
13
+
14
# Configure logging
# Logs are written both to 'fact_checker.log' and to the console stream so
# interactive runs and post-mortem debugging see the same record.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('fact_checker.log'),
        logging.StreamHandler()
    ]
)
# Module-level logger shared by every class and function in this file.
logger = logging.getLogger(__name__)

# Populate os.environ from a local .env file (e.g. TOGETHER_API_KEY) before
# any component asks for the API key.
load_dotenv()
logger.info("Environment variables loaded")
27
+
28
# ---------- API Key Helper -------------------------------------------------
def get_together_api_key() -> str:
    """Return the Together AI API key from the environment.

    Reads the ``TOGETHER_API_KEY`` environment variable, which may be set by
    the shell or by the ``load_dotenv()`` call at module import time.

    Returns:
        The API key string.

    Raises:
        EnvironmentError: If TOGETHER_API_KEY is not set.
    """
    key = os.getenv("TOGETHER_API_KEY")
    if key:
        logger.info("Together AI API key found")
        return key

    # No blanket try/except here: the previous version caught the
    # EnvironmentError it had just raised and logged it a second time via
    # logger.exception before re-raising, producing duplicate log noise.
    error_msg = (
        "TOGETHER_API_KEY not found. Please set it in one of these ways:\n"
        "1. Create a .env file with: TOGETHER_API_KEY=your_key_here\n"
        "2. Set environment variable: export TOGETHER_API_KEY=your_key_here"
    )
    logger.error(error_msg)
    raise EnvironmentError(error_msg)
48
+
49
+
50
+ # ========================================================================
51
+ # FACT-CHECKING SYSTEM COMPONENTS (OOP Architecture)
52
+ # ========================================================================
53
+
54
class ClaimExtractor:
    """Extracts claims (sentences) and named entities from text via spaCy.

    The spaCy pipeline is loaded lazily on first use, so constructing an
    extractor is cheap until claims are actually requested.
    """

    # Entity labels we keep; anything else spaCy finds is discarded.
    ENTITY_TYPES = ['ORG', 'GPE', 'PERSON', 'DATE', 'EVENT', 'MONEY',
                    'PERCENT', 'LAW', 'PRODUCT']

    def __init__(self, model_name: str = "en_core_web_sm"):
        """Record the model name; the pipeline itself is loaded on demand.

        Args:
            model_name: Name of the spaCy model to use
        """
        self.model_name = model_name
        self._nlp = None

    @property
    def nlp(self):
        """The spaCy pipeline, loaded on first access."""
        if self._nlp is not None:
            return self._nlp
        try:
            logger.info(f"Loading spaCy model: {self.model_name}")
            self._nlp = spacy.load(self.model_name)
            logger.info(f"Successfully loaded spaCy model: {self.model_name}")
        except OSError:
            # Model package is missing: tell the user how to install it.
            logger.error(f"spaCy model '{self.model_name}' not found")
            raise RuntimeError(
                f"spaCy model '{self.model_name}' not found. "
                f"Please install it with: python -m spacy download {self.model_name}"
            )
        except Exception:
            logger.exception(f"Unexpected error loading spaCy model: {self.model_name}")
            raise
        return self._nlp

    def extract_entities(self, doc) -> List[Dict[str, Any]]:
        """Collect named entities of the supported types from a spaCy doc.

        Args:
            doc: spaCy document object

        Returns:
            List of dicts with 'text', 'type', 'start' and 'end' keys;
            empty list on any extraction error.
        """
        try:
            found = [
                {
                    'text': ent.text,
                    'type': ent.label_,
                    'start': ent.start_char,
                    'end': ent.end_char
                }
                for ent in doc.ents
                if ent.label_ in self.ENTITY_TYPES
            ]
            logger.debug(f"Extracted {len(found)} entities")
            return found
        except Exception:
            logger.exception("Error extracting entities")
            return []

    def extract_claims(self, text: str, min_length: int = 10) -> List[Dict[str, Any]]:
        """Split input text into claims (sentences) with their entities.

        Args:
            text: Input text (e.g., news post, social media statement)
            min_length: Minimum length for a sentence to be considered a claim

        Returns:
            List of claim dicts with 'text', 'type', and 'entities'; on any
            failure a single fallback claim wrapping the raw text is returned.
        """
        try:
            logger.info(f"Extracting claims from text ({len(text)} chars)")
            doc = self.nlp(text)
            all_entities = self.extract_entities(doc)

            claims = []
            for sentence in doc.sents:
                stripped = sentence.text.strip()
                if len(stripped) < min_length:
                    continue
                # Keep only entities whose character span falls inside
                # this sentence.
                in_sentence = [
                    ent for ent in all_entities
                    if ent['start'] >= sentence.start_char
                    and ent['end'] <= sentence.end_char
                ]
                claims.append({
                    'text': stripped,
                    'type': 'statement',
                    'entities': in_sentence
                })

            if not claims:
                # Nothing passed the length filter: treat the whole input
                # as a single claim carrying every entity found.
                logger.debug("No sentences found, using entire text as claim")
                claims.append({
                    'text': text.strip(),
                    'type': 'statement',
                    'entities': all_entities
                })

            logger.info(f"Extracted {len(claims)} claim(s)")
            return claims
        except Exception:
            logger.exception("Error extracting claims")
            # Degrade gracefully: the raw text becomes the single claim.
            return [{
                'text': text.strip(),
                'type': 'statement',
                'entities': []
            }]
171
+
172
+
173
class FactsDatabase:
    """
    Manages the verified facts database and vector store.
    Handles loading, embedding, and persistence.

    Facts are read from a CSV, enriched with spaCy-extracted entities, and
    embedded into a FAISS index that is persisted to disk.
    """

    # Default on-disk locations; overridable per call in load_from_csv().
    DEFAULT_CSV_PATH = "verified_facts_db.csv"
    DEFAULT_INDEX_PATH = "faiss_index_facts"
    EMBEDDING_MODEL = "BAAI/bge-base-en-v1.5"

    def __init__(self, api_key: str = None):
        """
        Initialize the FactsDatabase.

        Args:
            api_key: Together AI API key (optional, can use get_together_api_key)

        Raises:
            EnvironmentError: If no key is given and TOGETHER_API_KEY is unset.
        """
        logger.info("Initializing FactsDatabase")
        self.api_key = api_key or get_together_api_key()

        try:
            self.embeddings = TogetherEmbeddings(
                model=self.EMBEDDING_MODEL,
                api_key=self.api_key
            )
            logger.info(f"Embeddings initialized with model: {self.EMBEDDING_MODEL}")

            # Initialize ClaimExtractor for entity extraction from facts
            self.claim_extractor = ClaimExtractor()
            logger.info("ClaimExtractor initialized for database entity extraction")

        except Exception as e:
            logger.exception("Error initializing embeddings")
            raise

    def load_from_csv(
        self,
        csv_path: str = None,
        index_path: str = None
    ) -> str:
        """
        Load verified facts from CSV and create FAISS vector store.

        Args:
            csv_path: Path to verified facts CSV file
            index_path: Path to save FAISS index

        Returns:
            Status message with count of loaded facts

        Raises:
            FileNotFoundError: If the CSV does not exist.
            RuntimeError: For any other failure during loading/indexing.
        """
        csv_path = csv_path or self.DEFAULT_CSV_PATH
        index_path = index_path or self.DEFAULT_INDEX_PATH

        try:
            logger.info(f"Loading facts from CSV: {csv_path}")
            # Read verified facts
            df = pd.read_csv(csv_path)
            logger.info(f"Loaded {len(df)} rows from CSV")

            # Handle different CSV formats
            if 'fact_text' in df.columns:
                fact_column = 'fact_text'
                logger.debug("Using 'fact_text' column")
            elif 'fact' in df.columns:
                fact_column = 'fact'
                logger.debug("Using 'fact' column")
            else:
                # NOTE(review): this ValueError is swallowed by the
                # `except Exception` below and resurfaces as RuntimeError —
                # confirm callers don't expect to catch ValueError.
                error_msg = "CSV must contain a 'fact' or 'fact_text' column"
                logger.error(error_msg)
                raise ValueError(error_msg)

            # Create documents with metadata
            logger.info("Creating documents with metadata")
            documents = self._create_documents(df, fact_column)
            logger.info(f"Created {len(documents)} documents")

            # Create FAISS index
            logger.info("Creating FAISS vector index...")
            vector_store = FAISS.from_documents(documents, self.embeddings)
            logger.info("FAISS index created successfully")

            # Save to disk
            logger.info(f"Saving FAISS index to: {index_path}")
            vector_store.save_local(index_path)
            logger.info("FAISS index saved successfully")

            return f"✅ Successfully loaded {len(documents)} verified facts into vector store"

        except FileNotFoundError:
            raise FileNotFoundError(f"Verified facts CSV not found at: {csv_path}")
        except Exception as e:
            raise RuntimeError(f"Error loading verified facts: {str(e)}")

    def _create_documents(
        self,
        df: pd.DataFrame,
        fact_column: str
    ) -> List[LangchainDocument]:
        """
        Create LangChain documents from DataFrame with entity extraction.

        Also performs lightweight data-quality checks (multi-sentence facts,
        unresolved pronouns) that only emit warnings — rows are never dropped.

        Args:
            df: Pandas DataFrame with facts
            fact_column: Name of the column containing fact text

        Returns:
            List of LangChain documents with metadata including extracted entities
        """
        try:
            documents = []
            multi_sentence_count = 0
            pronoun_count = 0

            for idx, row in df.iterrows():
                fact_text = row[fact_column]

                # Extract fact_id if available; otherwise synthesize one
                # from the row index (F000, F001, ...).
                if 'fact_id' in df.columns:
                    fact_id = row['fact_id']
                else:
                    fact_id = f"F{idx:03d}"

                # DATA VALIDATION: Check for multi-sentence facts
                sentences = fact_text.split('.')
                if len([s for s in sentences if s.strip()]) > 1:
                    multi_sentence_count += 1
                    logger.warning(
                        f"Fact {fact_id} contains multiple sentences ({len(sentences)} sentences). "
                        f"Consider splitting for better retrieval: {fact_text[:80]}..."
                    )

                # DATA VALIDATION: Check for unresolved pronouns
                # NOTE(review): plain substring match — "it " also matches
                # inside words like "profit in"; acceptable for a warning-only
                # heuristic, but expect false positives.
                pronouns = ['he ', 'she ', 'it ', 'they ', 'them ', 'his ', 'her ', 'their ']
                if any(pronoun in fact_text.lower() for pronoun in pronouns):
                    pronoun_count += 1
                    logger.warning(
                        f"Fact {fact_id} contains pronouns - may cause coreference issues: {fact_text[:80]}..."
                    )

                # ENTITY EXTRACTION: Extract entities from fact text
                entities = []
                entities_dict = {}
                try:
                    claims = self.claim_extractor.extract_claims(fact_text)
                    if claims and len(claims) > 0:
                        # Only the first claim's entities are used here;
                        # multi-sentence facts were already flagged above.
                        entities = claims[0].get('entities', [])
                        # Convert entities list to dict for easier access
                        entities_dict = {
                            'organizations': [e['text'] for e in entities if e['type'] in ['ORG', 'ORGANIZATION']],
                            'locations': [e['text'] for e in entities if e['type'] in ['GPE', 'LOC', 'LOCATION']],
                            'persons': [e['text'] for e in entities if e['type'] in ['PERSON', 'PER']],
                            'dates': [e['text'] for e in entities if e['type'] == 'DATE'],
                            'percentages': [e['text'] for e in entities if e['type'] in ['PERCENT', 'PERCENTAGE']],
                            'money': [e['text'] for e in entities if e['type'] in ['MONEY', 'CURRENCY']],
                            'all_entities': [e['text'] for e in entities]
                        }
                        logger.debug(f"Fact {fact_id}: Extracted {len(entities)} entities")
                except Exception as e:
                    # Best-effort: a fact without entities is still indexed.
                    logger.warning(f"Failed to extract entities from fact {fact_id}: {str(e)}")

                # Create metadata with entities
                metadata = {
                    'source': row.get('source', 'Verified Database'),
                    'date': row.get('date', 'N/A'),
                    'category': row.get('category', 'General'),
                    'fact_id': fact_id,
                    'entities': entities,  # Full entity list with types
                    'entities_dict': entities_dict  # Organized by type for easy filtering
                }

                # Create LangChain document with metadata
                doc = LangchainDocument(
                    page_content=fact_text,
                    metadata=metadata
                )
                documents.append(doc)

            # Summary logging
            logger.info(f"Created {len(documents)} documents from DataFrame")
            if multi_sentence_count > 0:
                logger.warning(
                    f"⚠️ {multi_sentence_count}/{len(documents)} facts contain multiple sentences. "
                    f"Consider atomic splitting for better granularity."
                )
            if pronoun_count > 0:
                logger.warning(
                    f"⚠️ {pronoun_count}/{len(documents)} facts contain pronouns. "
                    f"Consider coreference resolution."
                )

            # Log entity extraction statistics
            total_entities = sum(len(doc.metadata.get('entities', [])) for doc in documents)
            avg_entities = total_entities / len(documents) if documents else 0
            logger.info(
                f"Entity extraction complete: {total_entities} total entities "
                f"({avg_entities:.1f} avg per fact)"
            )

            return documents
        except Exception as e:
            logger.exception("Error creating documents from DataFrame")
            raise
375
+
376
+
377
class FactRetriever:
    """Semantic search over the verified-facts FAISS index.

    Loads the index lazily on first query and converts FAISS distances into
    similarity scores in the (0, 1] range.
    """

    DEFAULT_INDEX_PATH = "faiss_index_facts"
    EMBEDDING_MODEL = "BAAI/bge-base-en-v1.5"

    def __init__(self, api_key: str = None, index_path: str = None):
        """Set up the embeddings client; the FAISS index loads lazily.

        Args:
            api_key: Together AI API key
            index_path: Path to FAISS index
        """
        self.api_key = api_key or get_together_api_key()
        self.index_path = index_path or self.DEFAULT_INDEX_PATH
        logger.info(f"Initializing FactRetriever with index path: {self.index_path}")

        try:
            self.embeddings = TogetherEmbeddings(
                model=self.EMBEDDING_MODEL,
                api_key=self.api_key
            )
            logger.info(f"Embeddings model initialized: {self.EMBEDDING_MODEL}")
        except Exception:
            logger.exception("Error initializing embeddings model")
            raise

        self._vector_store = None

    @property
    def vector_store(self):
        """The FAISS store, read from disk on first access."""
        if self._vector_store is not None:
            return self._vector_store
        try:
            logger.info(f"Loading FAISS index from: {self.index_path}")
            store = FAISS.load_local(
                self.index_path,
                self.embeddings,
                allow_dangerous_deserialization=True
            )
        except FileNotFoundError:
            error_msg = f"FAISS index not found at: {self.index_path}. Please initialize the database first."
            logger.error(error_msg)
            raise FileNotFoundError(error_msg)
        except Exception as e:
            logger.exception("Error loading FAISS index")
            raise RuntimeError(f"Error loading FAISS index: {str(e)}")
        logger.info("FAISS index loaded successfully")
        self._vector_store = store
        return self._vector_store

    def retrieve(
        self,
        claim: str,
        top_k: int = 3,
        similarity_threshold: float = 0.0
    ) -> List[Dict[str, Any]]:
        """Return the most similar verified facts for a claim.

        Args:
            claim: The claim text to verify
            top_k: Number of similar facts to retrieve
            similarity_threshold: Minimum similarity score (0-1)

        Returns:
            List of dictionaries with 'fact', 'metadata', and 'similarity'

        Raises:
            RuntimeError: On any search or index-loading failure.
        """
        try:
            logger.info(f"Retrieving top-{top_k} facts for claim: {claim[:100]}...")

            scored_docs = self.vector_store.similarity_search_with_score(claim, k=top_k)
            logger.debug(f"Retrieved {len(scored_docs)} documents from FAISS")

            matches = []
            for document, distance in scored_docs:
                # FAISS reports a distance (lower = closer); convert it to
                # a similarity before thresholding.
                similarity = self._normalize_similarity(distance)
                if similarity < similarity_threshold:
                    continue
                matches.append({
                    'fact': document.page_content,
                    'metadata': document.metadata,
                    'similarity': round(similarity, 3)
                })
                logger.debug(f"Fact similarity: {similarity:.3f} - {document.page_content[:50]}...")

            logger.info(f"Filtered to {len(matches)} facts above threshold {similarity_threshold}")
            return matches

        except Exception as e:
            logger.exception("Error retrieving similar facts")
            raise RuntimeError(f"Error retrieving similar facts: {str(e)}")

    @staticmethod
    def _normalize_similarity(distance: float) -> float:
        """Map a FAISS distance (lower = closer) onto (0, 1].

        Args:
            distance: FAISS distance score (lower = more similar)

        Returns:
            Similarity score: 1 at distance 0, decaying toward 0.
        """
        return 1 / (distance + 1)
490
+
491
+
492
class ClaimClassifier:
    """
    Uses LLM to classify claims as True/False/Unverifiable.
    Handles prompt engineering and response parsing.

    The LLM is instructed to answer in strict JSON; parsing failures fall
    back to an 'Unverifiable' verdict rather than raising to the caller.
    """

    LLM_MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
    # Low temperature keeps the JSON-formatted verdict output stable.
    TEMPERATURE = 0.3

    # Verdict constants
    VERDICT_TRUE = "Likely True"
    VERDICT_FALSE = "Likely False"
    VERDICT_UNVERIFIABLE = "Unverifiable"

    def __init__(self, api_key: str = None):
        """
        Initialize the ClaimClassifier.

        Args:
            api_key: Together AI API key
        """
        self.api_key = api_key or get_together_api_key()
        logger.info(f"Initializing ClaimClassifier with model: {self.LLM_MODEL}")

        try:
            self.llm = ChatTogether(
                model=self.LLM_MODEL,
                temperature=self.TEMPERATURE,
                api_key=self.api_key
            )
            logger.info(f"LLM initialized successfully (temperature={self.TEMPERATURE})")
        except Exception as e:
            logger.exception("Error initializing LLM")
            raise

    def classify(
        self,
        claim: str,
        retrieved_facts: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """
        Classify a claim against retrieved facts using LLM.

        Never raises: any failure is converted into a fallback response
        with verdict 'Unverifiable'.

        Args:
            claim: The original claim to verify
            retrieved_facts: List of similar facts with metadata

        Returns:
            Dictionary with 'verdict', 'confidence', 'reasoning', 'evidence_used'
            plus 'evidence_details' echoing the retrieved facts.
        """
        logger.info(f"Classifying claim with {len(retrieved_facts)} retrieved facts")

        # Build prompt with evidence
        prompt = self._build_prompt(claim, retrieved_facts)
        logger.debug(f"Built prompt with {len(prompt)} characters")

        try:
            # Get LLM response
            logger.info("Invoking LLM for claim classification")
            response = self.llm.invoke([{"role": "user", "content": prompt}])
            response_text = response.content.strip()
            logger.debug(f"LLM response received ({len(response_text)} chars)")

            # Parse JSON response
            result = self._parse_response(response_text)
            logger.info(f"Classification result: {result['verdict']} (confidence: {result['confidence']})")

            # Add retrieved facts as evidence details
            result['evidence_details'] = retrieved_facts

            return result

        except json.JSONDecodeError as e:
            logger.error(f"JSON parsing failed: {str(e)}")
            return self._fallback_response(retrieved_facts, "JSON parsing failed")
        except Exception as e:
            logger.exception("Error during claim classification")
            return self._fallback_response(retrieved_facts, str(e))

    def _build_prompt(
        self,
        claim: str,
        retrieved_facts: List[Dict[str, Any]]
    ) -> str:
        """
        Build the classification prompt for the LLM.

        The verdict strings are interpolated from the class constants so the
        prompt and the parsing side always agree on the allowed labels.

        Args:
            claim: The claim to verify
            retrieved_facts: Retrieved evidence

        Returns:
            Formatted prompt string
        """
        # Format evidence
        evidence_text = self._format_evidence(retrieved_facts)

        # Construct prompt
        prompt = f"""You are a fact-checking assistant. Your task is to verify the following claim against verified evidence.

CLAIM TO VERIFY:
"{claim}"

VERIFIED EVIDENCE FROM DATABASE:
{evidence_text}

INSTRUCTIONS:
1. Compare the claim against the verified evidence carefully
2. Classify the claim as one of:
- "{self.VERDICT_TRUE}" - if evidence strongly supports the claim
- "{self.VERDICT_FALSE}" - if evidence contradicts the claim
- "{self.VERDICT_UNVERIFIABLE}" - if insufficient or conflicting evidence

3. Provide your analysis in EXACTLY this JSON format (no additional text):
{{
"verdict": "{self.VERDICT_TRUE}" | "{self.VERDICT_FALSE}" | "{self.VERDICT_UNVERIFIABLE}",
"confidence": "high" | "medium" | "low",
"reasoning": "Explain your decision in 2-3 sentences",
"evidence_used": ["fact 1", "fact 2"]
}}

IMPORTANT:
- Be objective and base your verdict only on the evidence provided
- If the evidence is vague or irrelevant, mark as "{self.VERDICT_UNVERIFIABLE}"
- Consider dates, entities, and specific details when comparing
- Return ONLY the JSON object, no other text

YOUR RESPONSE:"""

        return prompt

    def _format_evidence(self, retrieved_facts: List[Dict[str, Any]]) -> str:
        """
        Format retrieved facts for the prompt.

        Args:
            retrieved_facts: List of facts with metadata

        Returns:
            Formatted evidence string (one numbered block per fact)
        """
        if not retrieved_facts:
            return "No similar verified facts found in the database."

        evidence_lines = []
        for i, fact in enumerate(retrieved_facts, 1):
            lines = [
                f"Evidence {i}:",
                f"{fact['fact']}",
                f"Source: {fact['metadata'].get('source', 'Unknown')}",
                f"Date: {fact['metadata'].get('date', 'Unknown')}",
                f"Similarity: {fact['similarity']:.2f}"
            ]
            evidence_lines.append("\n".join(lines))

        return "\n\n".join(evidence_lines)

    def _parse_response(self, response_text: str) -> Dict[str, Any]:
        """
        Parse LLM JSON response.

        Tolerates extra prose around the JSON object and back-fills any
        missing required fields with placeholder values.

        Args:
            response_text: Raw LLM response

        Returns:
            Parsed result dictionary

        Raises:
            Exception: Re-raises any parsing error (handled by classify()).
        """
        try:
            # Try to extract JSON if LLM added extra text
            # (greedy match grabs from the first '{' to the last '}')
            json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
            if json_match:
                response_text = json_match.group(0)
                logger.debug("Extracted JSON from LLM response")

            result = json.loads(response_text)
            logger.debug("Successfully parsed JSON response")

            # Validate required fields
            required_fields = ['verdict', 'confidence', 'reasoning', 'evidence_used']
            missing_fields = [field for field in required_fields if field not in result]

            if missing_fields:
                logger.warning(f"Missing fields in LLM response: {missing_fields}")
                for field in missing_fields:
                    result[field] = "Unknown" if field != 'evidence_used' else []

            return result
        except Exception as e:
            logger.exception("Error parsing LLM response")
            raise

    def _fallback_response(
        self,
        retrieved_facts: List[Dict[str, Any]],
        error_msg: str
    ) -> Dict[str, Any]:
        """
        Create fallback response on error.

        Args:
            retrieved_facts: Retrieved evidence
            error_msg: Error message

        Returns:
            Fallback response dictionary (verdict 'Unverifiable', low
            confidence, original error preserved under 'error')
        """
        logger.warning(f"Creating fallback response due to: {error_msg}")
        return {
            'verdict': self.VERDICT_UNVERIFIABLE,
            'confidence': 'low',
            'reasoning': f'Error during fact-checking: {error_msg}',
            'evidence_used': [],
            'evidence_details': retrieved_facts,
            'error': error_msg
        }
707
+
708
+
709
class FactChecker:
    """Facade over the full fact-checking pipeline.

    Wires ClaimExtractor, FactRetriever and ClaimClassifier together behind
    a single check_claim() entry point. Pipeline failures are converted into
    an 'Unverifiable' result rather than raised.
    """

    def __init__(self, api_key: str = None):
        """Build every pipeline component up front.

        Args:
            api_key: Together AI API key
        """
        logger.info("Initializing FactChecker pipeline")
        self.api_key = api_key or get_together_api_key()

        try:
            # Components are injected as attributes so tests can swap them.
            logger.debug("Initializing ClaimExtractor")
            self.claim_extractor = ClaimExtractor()
            logger.debug("Initializing FactRetriever")
            self.fact_retriever = FactRetriever(api_key=self.api_key)
            logger.debug("Initializing ClaimClassifier")
            self.claim_classifier = ClaimClassifier(api_key=self.api_key)
            logger.info("FactChecker initialization complete")
        except Exception:
            logger.exception("Error initializing FactChecker")
            raise

    def check_claim(self, user_claim: str, top_k: int = 3) -> Dict[str, Any]:
        """Run extract -> retrieve -> classify for one input statement.

        Args:
            user_claim: User's input claim/statement to verify
            top_k: Number of similar facts to retrieve

        Returns:
            Complete fact-check result with verdict, evidence, and reasoning;
            on failure, an error result with verdict 'Unverifiable'.
        """
        banner = "=" * 60
        logger.info(banner)
        logger.info(f"Starting fact-check pipeline for claim: {user_claim[:100]}...")
        logger.info(banner)

        try:
            logger.info("Step 1: Extracting claims from input")
            claims = self.claim_extractor.extract_claims(user_claim)

            # Only the first (main) claim is fact-checked.
            main_claim = claims[0]['text'] if claims else user_claim
            logger.info(f"Main claim identified: {main_claim[:100]}...")

            logger.info(f"Step 2: Retrieving top-{top_k} similar facts")
            evidence = self.fact_retriever.retrieve(main_claim, top_k=top_k)
            logger.info(f"Retrieved {len(evidence)} similar facts")

            logger.info("Step 3: Classifying claim using LLM")
            result = self.claim_classifier.classify(main_claim, evidence)

            logger.info("Step 4: Adding metadata to result")
            result['original_input'] = user_claim
            result['extracted_claim'] = main_claim
            result['entities_found'] = claims[0].get('entities', []) if claims else []
            result['total_claims_extracted'] = len(claims)

            logger.info(f"Fact-check complete: {result['verdict']}")
            logger.info(banner)
            return result

        except Exception as e:
            logger.exception("Error in fact-checking pipeline")
            logger.info(banner)
            return self._error_response(user_claim, str(e))

    def _error_response(self, user_claim: str, error_msg: str) -> Dict[str, Any]:
        """Build the result returned when the pipeline fails.

        Args:
            user_claim: Original user claim
            error_msg: Error message

        Returns:
            Error response dictionary with verdict 'Unverifiable' and the
            failure preserved under 'error'.
        """
        logger.error(f"Creating error response for claim: {error_msg}")
        return {
            'verdict': 'Unverifiable',
            'confidence': 'low',
            'reasoning': f'Error during fact-checking pipeline: {error_msg}',
            'evidence_used': [],
            'evidence_details': [],
            'original_input': user_claim,
            'extracted_claim': user_claim,
            'entities_found': [],
            'error': error_msg
        }
814
+
815
+
816
+ # ========================================================================
817
+ # LEGACY FUNCTION WRAPPERS (for backward compatibility)
818
+ # ========================================================================
819
+
820
def load_verified_facts(csv_path: str = "verified_facts_db.csv") -> str:
    """Legacy wrapper kept for backward compatibility.

    Delegates to FactsDatabase.load_from_csv.

    Args:
        csv_path: Path to verified facts CSV file

    Returns:
        Status message
    """
    return FactsDatabase().load_from_csv(csv_path)
833
+
834
+
835
def retrieve_similar_facts(
    claim: str,
    top_k: int = 3,
    similarity_threshold: float = 0.0
) -> List[Dict[str, Any]]:
    """Legacy wrapper kept for backward compatibility.

    Delegates to FactRetriever.retrieve.

    Args:
        claim: The claim text to verify
        top_k: Number of similar facts to retrieve
        similarity_threshold: Minimum similarity score (0-1)

    Returns:
        List of dictionaries with 'fact', 'metadata', and 'similarity'
    """
    return FactRetriever().retrieve(claim, top_k, similarity_threshold)
854
+
855
+
856
def classify_claim(claim: str, retrieved_facts: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Legacy wrapper kept for backward compatibility.

    Delegates to ClaimClassifier.classify.

    Args:
        claim: The original claim to verify
        retrieved_facts: List of similar facts with metadata

    Returns:
        Dictionary with 'verdict', 'confidence', 'reasoning', 'evidence_used'
    """
    return ClaimClassifier().classify(claim, retrieved_facts)
870
+
871
+
872
def fact_check_claim(user_claim: str, top_k: int = 3) -> Dict[str, Any]:
    """Legacy wrapper kept for backward compatibility.

    Delegates to FactChecker.check_claim.

    Args:
        user_claim: User's input claim/statement to verify
        top_k: Number of similar facts to retrieve

    Returns:
        Complete fact-check result with verdict, evidence, and reasoning
    """
    return FactChecker().check_claim(user_claim, top_k)