File size: 31,852 Bytes
a3075d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
import os
import logging
from typing import List, Dict, Any
from dotenv import load_dotenv
from langchain.schema import Document as LangchainDocument
from langchain_community.vectorstores import FAISS
from langchain_together.chat_models import ChatTogether
from langchain_together.embeddings import TogetherEmbeddings
import spacy
import pandas as pd
import json
import re

# Configure logging: every record goes both to 'fact_checker.log' and to the
# console (StreamHandler with its default stream).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # The encoding is set explicitly because this module logs non-ASCII
        # text (e.g. warning emoji); with the platform default encoding the
        # file handler can fail to encode such records on non-UTF-8 locales.
        logging.FileHandler('fact_checker.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
# Module-level logger shared by everything defined in this file.
logger = logging.getLogger(__name__)

# Populate os.environ from a .env file (if any) before any API key is read.
load_dotenv()
logger.info("Environment variables loaded")

# ---------- API Key Helper -------------------------------------------------
def get_together_api_key() -> str:
    """Return the Together AI API key read from the environment.

    The value of the ``TOGETHER_API_KEY`` environment variable is returned
    with surrounding whitespace removed, so stray spaces or newlines (for
    example from a copy-pasted export) are not sent as part of the key.

    Returns:
        The non-empty API key.

    Raises:
        EnvironmentError: If ``TOGETHER_API_KEY`` is unset, empty, or contains
            only whitespace. The failure is logged once before raising.
    """
    # os.getenv returns None when the variable is unset; normalise to "" so
    # that unset, empty and whitespace-only values all take the error path.
    key = (os.getenv("TOGETHER_API_KEY") or "").strip()
    if key:
        logger.info("Together AI API key found")
        return key

    error_msg = (
        "TOGETHER_API_KEY not found. Please set it in one of these ways:\n"
        "1. Create a .env file with: TOGETHER_API_KEY=your_key_here\n"
        "2. Set environment variable: export TOGETHER_API_KEY=your_key_here"
    )
    logger.error(error_msg)
    raise EnvironmentError(error_msg)


# ========================================================================
# FACT-CHECKING SYSTEM COMPONENTS (OOP Architecture)
# ========================================================================

class ClaimExtractor:
    """
    spaCy-backed extraction of claims (sentences) and named entities.

    The spaCy pipeline is loaded on first access of ``nlp`` rather than when
    the extractor is constructed.
    """

    # Entity labels kept by extract_entities; spans with any other label are dropped
    ENTITY_TYPES = ['ORG', 'GPE', 'PERSON', 'DATE', 'EVENT', 'MONEY',
                    'PERCENT', 'LAW', 'PRODUCT']

    def __init__(self, model_name: str = "en_core_web_sm"):
        """
        Remember which spaCy model to use; nothing is loaded yet.

        Args:
            model_name: Name of the spaCy model to load on first use
        """
        self.model_name = model_name
        self._nlp = None

    @property
    def nlp(self):
        """
        Return the spaCy pipeline, loading and caching it on first access.

        Raises:
            RuntimeError: If spaCy reports the model as missing (OSError)
        """
        if self._nlp is not None:
            return self._nlp

        try:
            logger.info(f"Loading spaCy model: {self.model_name}")
            self._nlp = spacy.load(self.model_name)
            logger.info(f"Successfully loaded spaCy model: {self.model_name}")
        except OSError:
            logger.error(f"spaCy model '{self.model_name}' not found")
            raise RuntimeError(
                f"spaCy model '{self.model_name}' not found. "
                f"Please install it with: python -m spacy download {self.model_name}"
            )
        except Exception:
            logger.exception(f"Unexpected error loading spaCy model: {self.model_name}")
            raise
        return self._nlp

    @staticmethod
    def _statement(claim_text: str, claim_entities: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Build one claim record of type 'statement'."""
        return {
            'text': claim_text,
            'type': 'statement',
            'entities': claim_entities
        }

    def extract_entities(self, doc) -> List[Dict[str, Any]]:
        """
        Collect the named entities of a parsed document whose label is listed
        in ENTITY_TYPES.

        Args:
            doc: spaCy document object

        Returns:
            One dict per kept entity with 'text', 'type', 'start' and 'end'
            (character offsets); an empty list if anything goes wrong
        """
        try:
            found = [
                {
                    'text': span.text,
                    'type': span.label_,
                    'start': span.start_char,
                    'end': span.end_char
                }
                for span in doc.ents
                if span.label_ in self.ENTITY_TYPES
            ]
            logger.debug(f"Extracted {len(found)} entities")
            return found
        except Exception:
            # Best effort: entity extraction failures never propagate
            logger.exception("Error extracting entities")
            return []

    def extract_claims(self, text: str, min_length: int = 10) -> List[Dict[str, Any]]:
        """
        Split the input into sentence-level claims and attach to each claim
        the entities that lie inside it.

        Args:
            text: Input text (e.g., news post, social media statement)
            min_length: Minimum stripped length for a sentence to count as a claim

        Returns:
            List of claim dicts with 'text', 'type' and 'entities'. When no
            sentence qualifies, or when processing fails, a single claim
            holding the whole stripped text is returned instead.
        """
        try:
            logger.info(f"Extracting claims from text ({len(text)} chars)")
            parsed = self.nlp(text)
            all_entities = self.extract_entities(parsed)

            claims = []
            for sentence in parsed.sents:
                stripped = sentence.text.strip()
                if len(stripped) < min_length:
                    continue
                # Keep only entities whose character span sits inside this sentence
                inside = [
                    ent for ent in all_entities
                    if sentence.start_char <= ent['start'] and ent['end'] <= sentence.end_char
                ]
                claims.append(self._statement(stripped, inside))

            if len(claims) == 0:
                # Nothing qualified: the entire text becomes the one claim
                logger.debug("No sentences found, using entire text as claim")
                claims.append(self._statement(text.strip(), all_entities))

            logger.info(f"Extracted {len(claims)} claim(s)")
            return claims
        except Exception:
            logger.exception("Error extracting claims")
            # Fallback: whole text as a single claim without entities
            return [self._statement(text.strip(), [])]


class FactsDatabase:
    """
    Manages the verified facts database and vector store.

    Reads verified facts from a CSV file, attaches metadata and extracted
    named entities to each fact, embeds the facts and saves the resulting
    FAISS index to disk.
    """

    DEFAULT_CSV_PATH = "verified_facts_db.csv"
    DEFAULT_INDEX_PATH = "faiss_index_facts"
    EMBEDDING_MODEL = "BAAI/bge-base-en-v1.5"

    # Whole-word pronoun match. A plain substring test such as `"he " in text`
    # also fires on "the ", "profit ", "other " and so on.
    _PRONOUN_PATTERN = re.compile(
        r"\b(?:he|she|it|they|them|his|her|their)\b", re.IGNORECASE
    )
    # A sentence boundary is terminal punctuation followed by whitespace, so
    # decimals such as "3.5%" are not counted as sentence breaks.
    _SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?])\s+")
    # entities_dict key -> entity labels collected under that key
    _ENTITY_GROUPS = {
        'organizations': ('ORG', 'ORGANIZATION'),
        'locations': ('GPE', 'LOC', 'LOCATION'),
        'persons': ('PERSON', 'PER'),
        'dates': ('DATE',),
        'percentages': ('PERCENT', 'PERCENTAGE'),
        'money': ('MONEY', 'CURRENCY'),
    }

    def __init__(self, api_key: str = None):
        """
        Initialize the FactsDatabase.

        Args:
            api_key: Together AI API key (optional, falls back to
                get_together_api_key)
        """
        logger.info("Initializing FactsDatabase")
        self.api_key = api_key or get_together_api_key()

        try:
            self.embeddings = TogetherEmbeddings(
                model=self.EMBEDDING_MODEL,
                api_key=self.api_key
            )
            logger.info(f"Embeddings initialized with model: {self.EMBEDDING_MODEL}")

            # Initialize ClaimExtractor for entity extraction from facts
            self.claim_extractor = ClaimExtractor()
            logger.info("ClaimExtractor initialized for database entity extraction")

        except Exception:
            logger.exception("Error initializing embeddings")
            raise

    def load_from_csv(
        self,
        csv_path: str = None,
        index_path: str = None
    ) -> str:
        """
        Load verified facts from CSV and create FAISS vector store.

        Args:
            csv_path: Path to verified facts CSV file (defaults to
                DEFAULT_CSV_PATH)
            index_path: Path to save FAISS index (defaults to
                DEFAULT_INDEX_PATH)

        Returns:
            Status message with count of loaded facts

        Raises:
            FileNotFoundError: If a FileNotFoundError occurs while loading
                (reported as the CSV not being found)
            RuntimeError: For any other failure, including a CSV without a
                'fact'/'fact_text' column or without any usable fact; the
                original exception is chained as the cause
        """
        csv_path = csv_path or self.DEFAULT_CSV_PATH
        index_path = index_path or self.DEFAULT_INDEX_PATH

        try:
            logger.info(f"Loading facts from CSV: {csv_path}")
            df = pd.read_csv(csv_path)
            logger.info(f"Loaded {len(df)} rows from CSV")

            # Handle different CSV formats
            if 'fact_text' in df.columns:
                fact_column = 'fact_text'
                logger.debug("Using 'fact_text' column")
            elif 'fact' in df.columns:
                fact_column = 'fact'
                logger.debug("Using 'fact' column")
            else:
                error_msg = "CSV must contain a 'fact' or 'fact_text' column"
                logger.error(error_msg)
                raise ValueError(error_msg)

            logger.info("Creating documents with metadata")
            documents = self._create_documents(df, fact_column)
            logger.info(f"Created {len(documents)} documents")

            # Fail with a clear message instead of an obscure error from the
            # index builder when there is nothing to embed.
            if not documents:
                raise ValueError(f"No usable facts found in CSV: {csv_path}")

            logger.info("Creating FAISS vector index...")
            vector_store = FAISS.from_documents(documents, self.embeddings)
            logger.info("FAISS index created successfully")

            logger.info(f"Saving FAISS index to: {index_path}")
            vector_store.save_local(index_path)
            logger.info("FAISS index saved successfully")

            return f"✅ Successfully loaded {len(documents)} verified facts into vector store"

        except FileNotFoundError as e:
            raise FileNotFoundError(f"Verified facts CSV not found at: {csv_path}") from e
        except Exception as e:
            raise RuntimeError(f"Error loading verified facts: {str(e)}") from e

    @staticmethod
    def _cell(row, column: str, default: Any) -> Any:
        """Return row[column], or default when the column is absent or the cell is NaN."""
        value = row.get(column, default)
        return default if pd.isna(value) else value

    @classmethod
    def _group_entities(cls, entities: List[Dict[str, Any]]) -> Dict[str, List[str]]:
        """Organize entity texts by category, plus an 'all_entities' list."""
        grouped = {
            group: [e['text'] for e in entities if e['type'] in labels]
            for group, labels in cls._ENTITY_GROUPS.items()
        }
        grouped['all_entities'] = [e['text'] for e in entities]
        return grouped

    def _create_documents(
        self,
        df: pd.DataFrame,
        fact_column: str
    ) -> List[LangchainDocument]:
        """
        Create LangChain documents from DataFrame with entity extraction.

        Rows whose fact cell is missing (NaN) or blank are skipped with a
        warning instead of aborting the whole load. Each remaining fact is
        checked for multiple sentences and pronouns (warnings only) and its
        named entities are extracted from the full fact text.

        Args:
            df: Pandas DataFrame with facts
            fact_column: Name of the column containing fact text

        Returns:
            List of LangChain documents with metadata including extracted
            entities ('entities' and 'entities_dict')
        """
        try:
            documents = []
            multi_sentence_count = 0
            pronoun_count = 0
            skipped_count = 0
            has_fact_id = 'fact_id' in df.columns

            for idx, row in df.iterrows():
                # Use the CSV's fact_id when present, else derive one from the row index
                fact_id = row['fact_id'] if has_fact_id else f"F{idx:03d}"

                raw_fact = row[fact_column]
                if pd.isna(raw_fact) or not str(raw_fact).strip():
                    skipped_count += 1
                    logger.warning(f"Skipping fact {fact_id}: fact text is missing or blank")
                    continue
                fact_text = str(raw_fact)

                # DATA VALIDATION: Check for multi-sentence facts
                sentence_count = sum(
                    1 for part in self._SENTENCE_BOUNDARY.split(fact_text.strip()) if part
                )
                if sentence_count > 1:
                    multi_sentence_count += 1
                    logger.warning(
                        f"Fact {fact_id} contains multiple sentences ({sentence_count} sentences). "
                        f"Consider splitting for better retrieval: {fact_text[:80]}..."
                    )

                # DATA VALIDATION: Check for unresolved pronouns
                if self._PRONOUN_PATTERN.search(fact_text):
                    pronoun_count += 1
                    logger.warning(
                        f"Fact {fact_id} contains pronouns - may cause coreference issues: {fact_text[:80]}..."
                    )

                # ENTITY EXTRACTION: run on the whole fact so entities from
                # every sentence are kept, not only those of the first one.
                entities = []
                try:
                    extractor = self.claim_extractor
                    entities = extractor.extract_entities(extractor.nlp(fact_text))
                    logger.debug(f"Fact {fact_id}: Extracted {len(entities)} entities")
                except Exception as e:
                    # Best effort: a fact without entities is still indexed
                    logger.warning(f"Failed to extract entities from fact {fact_id}: {str(e)}")
                entities_dict = self._group_entities(entities)

                metadata = {
                    'source': self._cell(row, 'source', 'Verified Database'),
                    'date': self._cell(row, 'date', 'N/A'),
                    'category': self._cell(row, 'category', 'General'),
                    'fact_id': fact_id,
                    'entities': entities,  # Full entity list with types
                    'entities_dict': entities_dict  # Organized by type for easy filtering
                }

                documents.append(
                    LangchainDocument(page_content=fact_text, metadata=metadata)
                )

            # Summary logging
            logger.info(f"Created {len(documents)} documents from DataFrame")
            if skipped_count > 0:
                logger.warning(
                    f"⚠️  {skipped_count} row(s) skipped because the fact text was missing or blank."
                )
            if multi_sentence_count > 0:
                logger.warning(
                    f"⚠️  {multi_sentence_count}/{len(documents)} facts contain multiple sentences. "
                    f"Consider atomic splitting for better granularity."
                )
            if pronoun_count > 0:
                logger.warning(
                    f"⚠️  {pronoun_count}/{len(documents)} facts contain pronouns. "
                    f"Consider coreference resolution."
                )

            # Log entity extraction statistics
            total_entities = sum(len(doc.metadata.get('entities', [])) for doc in documents)
            avg_entities = total_entities / len(documents) if documents else 0
            logger.info(
                f"Entity extraction complete: {total_entities} total entities "
                f"({avg_entities:.1f} avg per fact)"
            )

            return documents
        except Exception:
            logger.exception("Error creating documents from DataFrame")
            raise


class FactRetriever:
    """
    Retrieves similar facts from the vector store using semantic search.

    The FAISS index is loaded from disk on first use; search scores are
    converted to similarity values before filtering.
    """

    DEFAULT_INDEX_PATH = "faiss_index_facts"
    EMBEDDING_MODEL = "BAAI/bge-base-en-v1.5"

    def __init__(self, api_key: str = None, index_path: str = None):
        """
        Initialize the FactRetriever.

        Args:
            api_key: Together AI API key (falls back to get_together_api_key)
            index_path: Path to FAISS index (defaults to DEFAULT_INDEX_PATH)
        """
        self.api_key = api_key or get_together_api_key()
        self.index_path = index_path or self.DEFAULT_INDEX_PATH
        logger.info(f"Initializing FactRetriever with index path: {self.index_path}")

        try:
            self.embeddings = TogetherEmbeddings(
                model=self.EMBEDDING_MODEL,
                api_key=self.api_key
            )
            logger.info(f"Embeddings model initialized: {self.EMBEDDING_MODEL}")
        except Exception:
            logger.exception("Error initializing embeddings model")
            raise

        # Loaded lazily by the vector_store property
        self._vector_store = None

    @property
    def vector_store(self):
        """
        Return the FAISS vector store, loading it from index_path on first access.

        Raises:
            FileNotFoundError: If loading raises FileNotFoundError
            RuntimeError: For any other loading failure (original exception
                chained as the cause)
        """
        if self._vector_store is None:
            try:
                logger.info(f"Loading FAISS index from: {self.index_path}")
                # NOTE(security): allow_dangerous_deserialization=True opts in
                # to unpickling the stored index data; unpickling can execute
                # arbitrary code, so index_path must only ever point at an
                # index this application created itself.
                self._vector_store = FAISS.load_local(
                    self.index_path,
                    self.embeddings,
                    allow_dangerous_deserialization=True
                )
                logger.info("FAISS index loaded successfully")
            except FileNotFoundError as e:
                error_msg = f"FAISS index not found at: {self.index_path}. Please initialize the database first."
                logger.error(error_msg)
                raise FileNotFoundError(error_msg) from e
            except Exception as e:
                logger.exception("Error loading FAISS index")
                raise RuntimeError(f"Error loading FAISS index: {str(e)}") from e
        return self._vector_store

    def retrieve(
        self,
        claim: str,
        top_k: int = 3,
        similarity_threshold: float = 0.0
    ) -> List[Dict[str, Any]]:
        """
        Retrieve most similar verified facts for a given claim.

        Args:
            claim: The claim text to verify
            top_k: Number of similar facts to retrieve
            similarity_threshold: Minimum similarity score (0-1)

        Returns:
            List of dictionaries with 'fact', 'metadata', and 'similarity'
            (a built-in float rounded to 3 decimals), in the order returned
            by the vector store

        Raises:
            RuntimeError: If the index cannot be loaded or the search fails
                (original exception chained as the cause)
        """
        try:
            logger.info(f"Retrieving top-{top_k} facts for claim: {claim[:100]}...")

            docs_with_scores = self.vector_store.similarity_search_with_score(
                claim, k=top_k
            )
            logger.debug(f"Retrieved {len(docs_with_scores)} documents from FAISS")

            similar_facts = []
            for doc, score in docs_with_scores:
                # The score is treated as a distance; convert to similarity
                similarity = self._normalize_similarity(score)

                if similarity >= similarity_threshold:
                    similar_facts.append({
                        'fact': doc.page_content,
                        'metadata': doc.metadata,
                        'similarity': round(similarity, 3)
                    })
                    logger.debug(f"Fact similarity: {similarity:.3f} - {doc.page_content[:50]}...")

            logger.info(f"Filtered to {len(similar_facts)} facts above threshold {similarity_threshold}")
            return similar_facts

        except Exception as e:
            logger.exception("Error retrieving similar facts")
            raise RuntimeError(f"Error retrieving similar facts: {str(e)}") from e

    @staticmethod
    def _normalize_similarity(distance: float) -> float:
        """
        Convert a distance score to a similarity score in the (0, 1] range.

        The input is converted to a built-in float first, so NumPy scalar
        scores do not leak into the results (they are not JSON-serializable).
        Negative distances are clamped to 0, which keeps the result within
        range and avoids a division by zero at -1.

        Args:
            distance: Distance score (lower = more similar)

        Returns:
            1 / (1 + distance) as a built-in float
        """
        clamped = max(float(distance), 0.0)
        return 1.0 / (1.0 + clamped)


class ClaimClassifier:
    """
    Uses LLM to classify claims as True/False/Unverifiable.

    Builds the prompt from the claim and the retrieved evidence, invokes the
    chat model and parses its JSON answer; any failure yields a low-confidence
    "Unverifiable" fallback instead of an exception.
    """

    LLM_MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
    TEMPERATURE = 0.3

    # Verdict constants
    VERDICT_TRUE = "Likely True"
    VERDICT_FALSE = "Likely False"
    VERDICT_UNVERIFIABLE = "Unverifiable"

    # Lower-cased verdict -> canonical constant, used to normalise LLM casing
    _CANONICAL_VERDICTS = {
        v.lower(): v for v in (VERDICT_TRUE, VERDICT_FALSE, VERDICT_UNVERIFIABLE)
    }
    # Fields every parsed result must carry
    _REQUIRED_FIELDS = ('verdict', 'confidence', 'reasoning', 'evidence_used')

    def __init__(self, api_key: str = None):
        """
        Initialize the ClaimClassifier.

        Args:
            api_key: Together AI API key (falls back to get_together_api_key)
        """
        self.api_key = api_key or get_together_api_key()
        logger.info(f"Initializing ClaimClassifier with model: {self.LLM_MODEL}")

        try:
            self.llm = ChatTogether(
                model=self.LLM_MODEL,
                temperature=self.TEMPERATURE,
                api_key=self.api_key
            )
            logger.info(f"LLM initialized successfully (temperature={self.TEMPERATURE})")
        except Exception:
            logger.exception("Error initializing LLM")
            raise

    def classify(
        self,
        claim: str,
        retrieved_facts: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """
        Classify a claim against retrieved facts using LLM.

        Args:
            claim: The original claim to verify
            retrieved_facts: List of similar facts with metadata

        Returns:
            Dictionary with 'verdict', 'confidence', 'reasoning',
            'evidence_used' and 'evidence_details' (the retrieved facts).
            On any error the fallback response is returned, which also
            carries an 'error' entry.
        """
        logger.info(f"Classifying claim with {len(retrieved_facts)} retrieved facts")

        prompt = self._build_prompt(claim, retrieved_facts)
        logger.debug(f"Built prompt with {len(prompt)} characters")

        try:
            logger.info("Invoking LLM for claim classification")
            response = self.llm.invoke([{"role": "user", "content": prompt}])
            response_text = response.content.strip()
            logger.debug(f"LLM response received ({len(response_text)} chars)")

            result = self._parse_response(response_text)
            logger.info(f"Classification result: {result['verdict']} (confidence: {result['confidence']})")

            # Add retrieved facts as evidence details
            result['evidence_details'] = retrieved_facts

            return result

        except json.JSONDecodeError as e:
            logger.error(f"JSON parsing failed: {str(e)}")
            return self._fallback_response(retrieved_facts, "JSON parsing failed")
        except Exception as e:
            logger.exception("Error during claim classification")
            return self._fallback_response(retrieved_facts, str(e))

    def _build_prompt(
        self,
        claim: str,
        retrieved_facts: List[Dict[str, Any]]
    ) -> str:
        """
        Build the classification prompt for the LLM.

        Args:
            claim: The claim to verify
            retrieved_facts: Retrieved evidence

        Returns:
            Formatted prompt string
        """
        evidence_text = self._format_evidence(retrieved_facts)

        # Doubled braces render as literal braces in the JSON template below
        prompt = f"""You are a fact-checking assistant. Your task is to verify the following claim against verified evidence.

CLAIM TO VERIFY:
"{claim}"

VERIFIED EVIDENCE FROM DATABASE:
{evidence_text}

INSTRUCTIONS:
1. Compare the claim against the verified evidence carefully
2. Classify the claim as one of:
   - "{self.VERDICT_TRUE}" - if evidence strongly supports the claim
   - "{self.VERDICT_FALSE}" - if evidence contradicts the claim
   - "{self.VERDICT_UNVERIFIABLE}" - if insufficient or conflicting evidence

3. Provide your analysis in EXACTLY this JSON format (no additional text):
{{
  "verdict": "{self.VERDICT_TRUE}" | "{self.VERDICT_FALSE}" | "{self.VERDICT_UNVERIFIABLE}",
  "confidence": "high" | "medium" | "low",
  "reasoning": "Explain your decision in 2-3 sentences",
  "evidence_used": ["fact 1", "fact 2"]
}}

IMPORTANT:
- Be objective and base your verdict only on the evidence provided
- If the evidence is vague or irrelevant, mark as "{self.VERDICT_UNVERIFIABLE}"
- Consider dates, entities, and specific details when comparing
- Return ONLY the JSON object, no other text

YOUR RESPONSE:"""

        return prompt

    def _format_evidence(self, retrieved_facts: List[Dict[str, Any]]) -> str:
        """
        Format retrieved facts for the prompt.

        Args:
            retrieved_facts: List of facts, each with 'fact', 'metadata'
                and 'similarity'

        Returns:
            One numbered section per fact separated by blank lines, or a
            fixed notice when there are no facts
        """
        if not retrieved_facts:
            return "No similar verified facts found in the database."

        evidence_lines = []
        for i, fact in enumerate(retrieved_facts, 1):
            lines = [
                f"Evidence {i}:",
                f"{fact['fact']}",
                f"Source: {fact['metadata'].get('source', 'Unknown')}",
                f"Date: {fact['metadata'].get('date', 'Unknown')}",
                f"Similarity: {fact['similarity']:.2f}"
            ]
            evidence_lines.append("\n".join(lines))

        return "\n\n".join(evidence_lines)

    def _parse_response(self, response_text: str) -> Dict[str, Any]:
        """
        Parse LLM JSON response.

        The first decodable JSON object in the text is used, so extra text
        before or after it (even text that itself contains braces) does not
        break parsing. A verdict that matches one of the verdict constants
        ignoring case and surrounding whitespace is rewritten to the constant.
        Missing required fields are filled with "Unknown" ([] for
        'evidence_used').

        Args:
            response_text: Raw LLM response

        Returns:
            Parsed result dictionary

        Raises:
            json.JSONDecodeError: If the text contains no decodable JSON object
        """
        decoder = json.JSONDecoder()
        first_error = None
        start = response_text.find('{')
        while start != -1:
            try:
                # raw_decode stops at the end of the object and ignores the rest
                result, _ = decoder.raw_decode(response_text, start)
                break
            except json.JSONDecodeError as err:
                if first_error is None:
                    first_error = err
                start = response_text.find('{', start + 1)
        else:
            # No '{' decoded: report the error from the outermost candidate
            if first_error is not None:
                raise first_error
            raise json.JSONDecodeError(
                "No JSON object found in LLM response", response_text, 0
            )
        logger.debug("Successfully parsed JSON response")

        verdict = result.get('verdict')
        if isinstance(verdict, str):
            canonical = self._CANONICAL_VERDICTS.get(verdict.strip().lower())
            if canonical is not None:
                result['verdict'] = canonical
            else:
                logger.warning(f"Unrecognized verdict in LLM response: {verdict!r}")

        missing_fields = [field for field in self._REQUIRED_FIELDS if field not in result]
        if missing_fields:
            logger.warning(f"Missing fields in LLM response: {missing_fields}")
            for field in missing_fields:
                result[field] = "Unknown" if field != 'evidence_used' else []

        return result

    def _fallback_response(
        self,
        retrieved_facts: List[Dict[str, Any]],
        error_msg: str
    ) -> Dict[str, Any]:
        """
        Create fallback response on error.

        Args:
            retrieved_facts: Retrieved evidence
            error_msg: Error message

        Returns:
            "Unverifiable" / low-confidence response that carries the error
            message and the retrieved facts
        """
        logger.warning(f"Creating fallback response due to: {error_msg}")
        return {
            'verdict': self.VERDICT_UNVERIFIABLE,
            'confidence': 'low',
            'reasoning': f'Error during fact-checking: {error_msg}',
            'evidence_used': [],
            'evidence_details': retrieved_facts,
            'error': error_msg
        }


class FactChecker:
    """
    Main orchestrator for the fact-checking pipeline.
    Coordinates ClaimExtractor, FactRetriever, and ClaimClassifier.
    Follows Facade pattern to provide simple interface.
    """

    def __init__(self, api_key: str = None):
        """
        Initialize the FactChecker with all required components.

        Args:
            api_key: Together AI API key. When omitted (or falsy), the key
                returned by get_together_api_key() is used instead.

        Raises:
            Exception: Any error raised while constructing a component is
                logged with its traceback and re-raised unchanged.
        """
        logger.info("Initializing FactChecker pipeline")
        self.api_key = api_key or get_together_api_key()

        try:
            # Initialize components (Dependency Injection)
            logger.debug("Initializing ClaimExtractor")
            self.claim_extractor = ClaimExtractor()

            logger.debug("Initializing FactRetriever")
            self.fact_retriever = FactRetriever(api_key=self.api_key)

            logger.debug("Initializing ClaimClassifier")
            self.claim_classifier = ClaimClassifier(api_key=self.api_key)

            logger.info("FactChecker initialization complete")
        except Exception:
            logger.exception("Error initializing FactChecker")
            raise

    def check_claim(self, user_claim: str, top_k: int = 3) -> Dict[str, Any]:
        """
        Main fact-checking pipeline that orchestrates the entire process.

        Only the first claim extracted from the input is fact-checked; if
        no claim is extracted, the raw input is used as the claim. This
        method does not raise on pipeline failures: any exception is logged
        and converted into an error response.

        Args:
            user_claim: User's input claim/statement to verify
            top_k: Number of similar facts to retrieve

        Returns:
            Complete fact-check result with verdict, evidence, and reasoning.
            On failure (including empty or non-string input) the dictionary
            built by _error_response is returned instead.
        """
        # Reject unusable input up front: slicing a non-string for the log
        # line below would otherwise raise before the try block is entered,
        # and a blank claim gives the pipeline nothing to verify.
        if not isinstance(user_claim, str) or not user_claim.strip():
            logger.warning("Rejected empty or non-text claim")
            return self._error_response(
                user_claim, "Claim must be a non-empty string"
            )

        logger.info("=" * 60)
        logger.info(f"Starting fact-check pipeline for claim: {user_claim[:100]}...")
        logger.info("=" * 60)

        try:
            # Step 1: Extract claims from input
            logger.info("Step 1: Extracting claims from input")
            claims = self.claim_extractor.extract_claims(user_claim)

            # For simplicity, fact-check the first/main claim
            main_claim = claims[0]['text'] if claims else user_claim
            logger.info(f"Main claim identified: {main_claim[:100]}...")

            # Step 2: Retrieve similar facts
            logger.info(f"Step 2: Retrieving top-{top_k} similar facts")
            similar_facts = self.fact_retriever.retrieve(main_claim, top_k=top_k)
            logger.info(f"Retrieved {len(similar_facts)} similar facts")

            # Step 3: Classify using LLM
            logger.info("Step 3: Classifying claim using LLM")
            result = self.claim_classifier.classify(main_claim, similar_facts)

            # Step 4: Add metadata
            logger.info("Step 4: Adding metadata to result")
            result['original_input'] = user_claim
            result['extracted_claim'] = main_claim
            result['entities_found'] = claims[0].get('entities', []) if claims else []
            result['total_claims_extracted'] = len(claims)

            logger.info(f"Fact-check complete: {result['verdict']}")
            logger.info("=" * 60)
            return result

        except Exception as e:
            # Top-level boundary: callers always get a result dictionary
            logger.exception("Error in fact-checking pipeline")
            logger.info("=" * 60)
            return self._error_response(user_claim, str(e))

    def _error_response(self, user_claim: str, error_msg: str) -> Dict[str, Any]:
        """
        Create error response when pipeline fails.

        The dictionary carries the same metadata keys that check_claim adds
        on success, so consumers can read either kind of result uniformly.

        Args:
            user_claim: Original user claim
            error_msg: Error message

        Returns:
            Error response dictionary
        """
        logger.error(f"Creating error response for claim: {error_msg}")
        return {
            'verdict': 'Unverifiable',
            'confidence': 'low',
            'reasoning': f'Error during fact-checking pipeline: {error_msg}',
            'evidence_used': [],
            'evidence_details': [],
            'original_input': user_claim,
            'extracted_claim': user_claim,
            'entities_found': [],
            # Present on the success path too; 0 because nothing was extracted
            'total_claims_extracted': 0,
            'error': error_msg
        }


# ========================================================================
# LEGACY FUNCTION WRAPPERS (for backward compatibility)
# ========================================================================

def load_verified_facts(csv_path: str = "verified_facts_db.csv") -> str:
    """
    Backward-compatible entry point for loading the verified-facts CSV.

    Delegates to the load_from_csv method of a freshly constructed
    FactsDatabase.

    Args:
        csv_path: Location of the verified facts CSV file

    Returns:
        The value returned by FactsDatabase.load_from_csv (a status message)
    """
    return FactsDatabase().load_from_csv(csv_path)


def retrieve_similar_facts(
    claim: str,
    top_k: int = 3,
    similarity_threshold: float = 0.0
) -> List[Dict[str, Any]]:
    """
    Backward-compatible entry point for evidence retrieval.

    Builds a default-configured FactRetriever and forwards all three
    arguments to its retrieve method.

    Args:
        claim: The claim text to verify
        top_k: Number of similar facts to retrieve
        similarity_threshold: Minimum similarity score (0-1)

    Returns:
        List of dictionaries with 'fact', 'metadata', and 'similarity'
    """
    return FactRetriever().retrieve(claim, top_k, similarity_threshold)


def classify_claim(claim: str, retrieved_facts: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Backward-compatible entry point for claim classification.

    Builds a default-configured ClaimClassifier and forwards the claim and
    its evidence to the classify method.

    Args:
        claim: The original claim to verify
        retrieved_facts: List of similar facts with metadata

    Returns:
        Dictionary with 'verdict', 'confidence', 'reasoning', 'evidence_used'
    """
    return ClaimClassifier().classify(claim, retrieved_facts)


def fact_check_claim(user_claim: str, top_k: int = 3) -> Dict[str, Any]:
    """
    Backward-compatible entry point for the full fact-checking pipeline.

    Builds a default-configured FactChecker and runs check_claim on the
    given input.

    Args:
        user_claim: User's input claim/statement to verify
        top_k: Number of similar facts to retrieve

    Returns:
        Complete fact-check result with verdict, evidence, and reasoning
    """
    return FactChecker().check_claim(user_claim, top_k=top_k)