# Page header captured together with this file (not part of the program):
# Spaces: adi-123 / Fact_Checker - Running
# File size: 31,852 Bytes
# a3075d5

import os
import logging
from typing import List, Dict, Any
from dotenv import load_dotenv
from langchain.schema import Document as LangchainDocument
from langchain_community.vectorstores import FAISS
from langchain_together.chat_models import ChatTogether
from langchain_together.embeddings import TogetherEmbeddings
import spacy
import pandas as pd
import json
import re

# Logging setup: INFO-and-above records go to a log file and to a stream handler.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[logging.FileHandler('fact_checker.log'), logging.StreamHandler()],
)
logger = logging.getLogger(__name__)

# Load variables via python-dotenv before anything reads the environment,
# then record that the step ran.
load_dotenv()
logger.info("Environment variables loaded")

# ---------- API Key Helper -------------------------------------------------
def get_together_api_key() -> str:
    """
    Get Together AI API key from environment variables.

    The key is read from ``TOGETHER_API_KEY``. Surrounding whitespace is
    stripped, and an empty or whitespace-only value is treated exactly like
    a missing variable.

    Returns:
        The API key with surrounding whitespace removed.

    Raises:
        EnvironmentError: If ``TOGETHER_API_KEY`` is unset or blank.
    """
    # `or ""` covers the unset case so .strip() is always called on a str.
    key = (os.getenv("TOGETHER_API_KEY") or "").strip()
    if key:
        logger.info("Together AI API key found")
        return key

    # If not found, raise error (logged once here; no outer handler re-logs it)
    error_msg = (
        "TOGETHER_API_KEY not found. Please set it in one of these ways:\n"
        "1. Create a .env file with: TOGETHER_API_KEY=your_key_here\n"
        "2. Set environment variable: export TOGETHER_API_KEY=your_key_here"
    )
    logger.error(error_msg)
    raise EnvironmentError(error_msg)


# ========================================================================
# FACT-CHECKING SYSTEM COMPONENTS (OOP Architecture)
# ========================================================================

class ClaimExtractor:
    """
    Handles claim and entity extraction using NLP (spaCy).
    Follows Single Responsibility Principle.

    Each sentence of the input is a candidate claim; named entities whose
    label appears in ENTITY_TYPES are attached to the sentence that contains
    them. The spaCy pipeline is loaded lazily on first use.
    """

    # Supported entity types for extraction
    ENTITY_TYPES = ['ORG', 'GPE', 'PERSON', 'DATE', 'EVENT', 'MONEY',
                    'PERCENT', 'LAW', 'PRODUCT']

    def __init__(self, model_name: str = "en_core_web_sm"):
        """
        Initialize the ClaimExtractor with a spaCy model.

        Args:
            model_name: Name of the spaCy model to use
        """
        self.model_name = model_name
        # Populated by the `nlp` property the first time it is accessed.
        self._nlp = None

    @property
    def nlp(self):
        """
        Lazy load spaCy model to avoid startup overhead.

        Returns:
            The loaded spaCy pipeline (cached after the first access).

        Raises:
            RuntimeError: If loading raises OSError (model not installed);
                the original error is kept as the cause.
        """
        if self._nlp is None:
            try:
                logger.info(f"Loading spaCy model: {self.model_name}")
                self._nlp = spacy.load(self.model_name)
                logger.info(f"Successfully loaded spaCy model: {self.model_name}")
            except OSError as e:
                logger.error(f"spaCy model '{self.model_name}' not found")
                raise RuntimeError(
                    f"spaCy model '{self.model_name}' not found. "
                    f"Please install it with: python -m spacy download {self.model_name}"
                ) from e
            except Exception:
                logger.exception(f"Unexpected error loading spaCy model: {self.model_name}")
                raise
        return self._nlp

    def extract_entities(self, doc) -> List[Dict[str, Any]]:
        """
        Extract named entities from a spaCy document.

        Args:
            doc: spaCy document object (only its ``ents`` are read)

        Returns:
            List of entity dictionaries with 'text', 'type', 'start' and
            'end' (character offsets). Returns an empty list if anything
            goes wrong while reading the entities.
        """
        try:
            entities = [
                {
                    'text': ent.text,
                    'type': ent.label_,
                    'start': ent.start_char,
                    'end': ent.end_char
                }
                for ent in doc.ents
                if ent.label_ in self.ENTITY_TYPES
            ]
            logger.debug(f"Extracted {len(entities)} entities")
            return entities
        except Exception:
            # Best-effort: entity extraction must not break claim extraction.
            logger.exception("Error extracting entities")
            return []

    def extract_claims(self, text: str, min_length: int = 10) -> List[Dict[str, Any]]:
        """
        Extract key claims and named entities from input text.

        Args:
            text: Input text (e.g., news post, social media statement)
            min_length: Minimum length for a sentence to be considered a claim

        Returns:
            List of claim dictionaries with 'text', 'type', and 'entities'.
            Always contains at least one claim: if no sentence qualifies, or
            if processing fails, the whole (stripped) text is returned as a
            single claim.
        """
        try:
            logger.info(f"Extracting claims from text ({len(text)} chars)")
            doc = self.nlp(text)
            entities = self.extract_entities(doc)

            # Extract sentences as potential claims
            claims = []
            for sent in doc.sents:
                sent_text = sent.text.strip()
                if len(sent_text) < min_length:
                    continue

                # Entities whose character span lies inside this sentence
                sent_entities = [
                    e for e in entities
                    if e['start'] >= sent.start_char and e['end'] <= sent.end_char
                ]
                claims.append({
                    'text': sent_text,
                    'type': 'statement',
                    'entities': sent_entities
                })

            # If no claims extracted, treat entire text as one claim
            if not claims:
                logger.debug("No sentences found, using entire text as claim")
                claims.append({
                    'text': text.strip(),
                    'type': 'statement',
                    'entities': entities
                })

            logger.info(f"Extracted {len(claims)} claim(s)")
            return claims
        except Exception:
            logger.exception("Error extracting claims")
            # Return fallback claim. The input itself may be what failed
            # (e.g. None), so do not assume it has a .strip() method here.
            fallback_text = '' if text is None else str(text).strip()
            return [{
                'text': fallback_text,
                'type': 'statement',
                'entities': []
            }]


class FactsDatabase:
    """
    Manages the verified facts database and vector store.
    Handles loading, embedding, and persistence.

    Facts are read from a CSV file, wrapped in LangChain documents with
    metadata (source, date, category, id, extracted entities), embedded, and
    saved as a FAISS index on disk.
    """

    DEFAULT_CSV_PATH = "verified_facts_db.csv"
    DEFAULT_INDEX_PATH = "faiss_index_facts"
    EMBEDDING_MODEL = "BAAI/bge-base-en-v1.5"

    # Sentence boundary heuristic: terminal punctuation followed by whitespace.
    # Unlike splitting on every '.', this does not break on decimals ("3.5%").
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')
    # Whole-word pronoun match, so "the", "other" or "unit" are not flagged.
    _PRONOUN_PATTERN = re.compile(
        r'\b(?:he|she|it|they|them|his|her|their)\b', re.IGNORECASE
    )

    def __init__(self, api_key: str = None):
        """
        Initialize the FactsDatabase.

        Args:
            api_key: Together AI API key (optional, can use get_together_api_key)

        Raises:
            Exception: Whatever the embeddings or extractor constructors
                raise is logged and re-raised unchanged.
        """
        logger.info("Initializing FactsDatabase")
        self.api_key = api_key or get_together_api_key()

        try:
            self.embeddings = TogetherEmbeddings(
                model=self.EMBEDDING_MODEL,
                api_key=self.api_key
            )
            logger.info(f"Embeddings initialized with model: {self.EMBEDDING_MODEL}")

            # Initialize ClaimExtractor for entity extraction from facts
            self.claim_extractor = ClaimExtractor()
            logger.info("ClaimExtractor initialized for database entity extraction")

        except Exception:
            logger.exception("Error initializing embeddings")
            raise

    def load_from_csv(
        self,
        csv_path: str = None,
        index_path: str = None
    ) -> str:
        """
        Load verified facts from CSV and create FAISS vector store.

        Args:
            csv_path: Path to verified facts CSV file
                (defaults to DEFAULT_CSV_PATH)
            index_path: Path to save FAISS index
                (defaults to DEFAULT_INDEX_PATH)

        Returns:
            Status message with count of loaded facts

        Raises:
            FileNotFoundError: If the CSV file does not exist.
            RuntimeError: For any other failure (missing fact column, no
                usable facts, embedding or save errors); the original
                exception is attached as the cause.
        """
        csv_path = csv_path or self.DEFAULT_CSV_PATH
        index_path = index_path or self.DEFAULT_INDEX_PATH

        try:
            logger.info(f"Loading facts from CSV: {csv_path}")
            # Read verified facts
            df = pd.read_csv(csv_path)
            logger.info(f"Loaded {len(df)} rows from CSV")

            # Handle different CSV formats
            if 'fact_text' in df.columns:
                fact_column = 'fact_text'
                logger.debug("Using 'fact_text' column")
            elif 'fact' in df.columns:
                fact_column = 'fact'
                logger.debug("Using 'fact' column")
            else:
                error_msg = "CSV must contain a 'fact' or 'fact_text' column"
                logger.error(error_msg)
                raise ValueError(error_msg)

            # Create documents with metadata
            logger.info("Creating documents with metadata")
            documents = self._create_documents(df, fact_column)
            logger.info(f"Created {len(documents)} documents")

            # Fail with a clear message instead of handing FAISS an empty list.
            if not documents:
                error_msg = f"No usable facts found in CSV: {csv_path}"
                logger.error(error_msg)
                raise ValueError(error_msg)

            # Create FAISS index
            logger.info("Creating FAISS vector index...")
            vector_store = FAISS.from_documents(documents, self.embeddings)
            logger.info("FAISS index created successfully")

            # Save to disk
            logger.info(f"Saving FAISS index to: {index_path}")
            vector_store.save_local(index_path)
            logger.info("FAISS index saved successfully")

            return f"✅ Successfully loaded {len(documents)} verified facts into vector store"

        except FileNotFoundError as e:
            raise FileNotFoundError(f"Verified facts CSV not found at: {csv_path}") from e
        except Exception as e:
            logger.exception("Error loading verified facts")
            raise RuntimeError(f"Error loading verified facts: {str(e)}") from e

    @classmethod
    def _count_sentences(cls, text: str) -> int:
        """Count non-empty sentence-like segments in *text* (heuristic)."""
        return len([s for s in cls._SENTENCE_BOUNDARY.split(text.strip()) if s.strip()])

    @classmethod
    def _contains_pronoun(cls, text: str) -> bool:
        """Return True if *text* contains one of the tracked pronouns as a whole word."""
        return cls._PRONOUN_PATTERN.search(text) is not None

    @staticmethod
    def _group_entities(entities: List[Dict[str, Any]]) -> Dict[str, List[str]]:
        """Organize entity texts by category for easier filtering."""
        def texts(*labels: str) -> List[str]:
            return [e['text'] for e in entities if e['type'] in labels]

        return {
            'organizations': texts('ORG', 'ORGANIZATION'),
            'locations': texts('GPE', 'LOC', 'LOCATION'),
            'persons': texts('PERSON', 'PER'),
            'dates': texts('DATE'),
            'percentages': texts('PERCENT', 'PERCENTAGE'),
            'money': texts('MONEY', 'CURRENCY'),
            'all_entities': [e['text'] for e in entities]
        }

    def _extract_fact_entities(self, fact_id: Any, fact_text: str) -> tuple:
        """
        Extract entities for one fact; never raises.

        Entities are collected from every claim (sentence) returned by the
        claim extractor, not only the first one, so multi-sentence facts
        keep all of their entities.

        Returns:
            ``(entities, entities_dict)``; both empty when no claims are
            returned or extraction fails.
        """
        try:
            claims = self.claim_extractor.extract_claims(fact_text)
            if not claims:
                return [], {}
            entities = [e for claim in claims for e in claim.get('entities', [])]
            entities_dict = self._group_entities(entities)
            logger.debug(f"Fact {fact_id}: Extracted {len(entities)} entities")
            return entities, entities_dict
        except Exception as e:
            logger.warning(f"Failed to extract entities from fact {fact_id}: {str(e)}")
            return [], {}

    def _create_documents(
        self,
        df: pd.DataFrame,
        fact_column: str
    ) -> "List[LangchainDocument]":
        """
        Create LangChain documents from DataFrame with entity extraction.

        Rows whose fact text is missing (NaN) or blank are skipped with a
        warning. Facts that look multi-sentence or contain pronouns are
        logged as data-quality warnings but still included.

        Args:
            df: Pandas DataFrame with facts
            fact_column: Name of the column containing fact text

        Returns:
            List of LangChain documents with metadata including extracted entities
        """
        try:
            documents = []
            multi_sentence_count = 0
            pronoun_count = 0
            has_fact_id = 'fact_id' in df.columns

            for idx, row in df.iterrows():
                # Extract fact_id if available
                fact_id = row['fact_id'] if has_fact_id else f"F{idx:03d}"

                # pandas represents empty CSV cells as NaN (a float), which
                # has no string methods; skip those rows instead of crashing.
                raw_fact = row[fact_column]
                if pd.isna(raw_fact) or not str(raw_fact).strip():
                    logger.warning(f"Skipping fact {fact_id}: empty fact text")
                    continue
                fact_text = str(raw_fact)

                # DATA VALIDATION: Check for multi-sentence facts
                sentence_count = self._count_sentences(fact_text)
                if sentence_count > 1:
                    multi_sentence_count += 1
                    logger.warning(
                        f"Fact {fact_id} contains multiple sentences ({sentence_count} sentences). "
                        f"Consider splitting for better retrieval: {fact_text[:80]}..."
                    )

                # DATA VALIDATION: Check for unresolved pronouns
                if self._contains_pronoun(fact_text):
                    pronoun_count += 1
                    logger.warning(
                        f"Fact {fact_id} contains pronouns - may cause coreference issues: {fact_text[:80]}..."
                    )

                # ENTITY EXTRACTION: Extract entities from fact text
                entities, entities_dict = self._extract_fact_entities(fact_id, fact_text)

                # Create metadata with entities
                metadata = {
                    'source': row.get('source', 'Verified Database'),
                    'date': row.get('date', 'N/A'),
                    'category': row.get('category', 'General'),
                    'fact_id': fact_id,
                    'entities': entities,  # Full entity list with types
                    'entities_dict': entities_dict  # Organized by type for easy filtering
                }

                # Create LangChain document with metadata
                documents.append(
                    LangchainDocument(page_content=fact_text, metadata=metadata)
                )

            # Summary logging
            logger.info(f"Created {len(documents)} documents from DataFrame")
            if multi_sentence_count > 0:
                logger.warning(
                    f"⚠️  {multi_sentence_count}/{len(documents)} facts contain multiple sentences. "
                    f"Consider atomic splitting for better granularity."
                )
            if pronoun_count > 0:
                logger.warning(
                    f"⚠️  {pronoun_count}/{len(documents)} facts contain pronouns. "
                    f"Consider coreference resolution."
                )

            # Log entity extraction statistics
            total_entities = sum(len(doc.metadata.get('entities', [])) for doc in documents)
            avg_entities = total_entities / len(documents) if documents else 0
            logger.info(
                f"Entity extraction complete: {total_entities} total entities "
                f"({avg_entities:.1f} avg per fact)"
            )

            return documents
        except Exception:
            logger.exception("Error creating documents from DataFrame")
            raise


class FactRetriever:
    """
    Retrieves similar facts from the vector store using semantic search.
    Implements retrieval strategies and similarity scoring.

    The FAISS index is loaded from disk on first use and cached on the
    instance.
    """

    DEFAULT_INDEX_PATH = "faiss_index_facts"
    EMBEDDING_MODEL = "BAAI/bge-base-en-v1.5"

    def __init__(self, api_key: str = None, index_path: str = None):
        """
        Initialize the FactRetriever.

        Args:
            api_key: Together AI API key
            index_path: Path to FAISS index (defaults to DEFAULT_INDEX_PATH)
        """
        self.api_key = api_key or get_together_api_key()
        self.index_path = index_path or self.DEFAULT_INDEX_PATH
        logger.info(f"Initializing FactRetriever with index path: {self.index_path}")

        try:
            self.embeddings = TogetherEmbeddings(
                model=self.EMBEDDING_MODEL,
                api_key=self.api_key
            )
            logger.info(f"Embeddings model initialized: {self.EMBEDDING_MODEL}")
        except Exception:
            logger.exception("Error initializing embeddings model")
            raise

        # Populated by the `vector_store` property on first access.
        self._vector_store = None

    @property
    def vector_store(self):
        """
        Lazy load vector store to avoid unnecessary I/O.

        Returns:
            The loaded FAISS vector store (cached after the first access).

        Raises:
            FileNotFoundError: If nothing exists at ``index_path``.
            RuntimeError: If loading the index fails for another reason.
        """
        if self._vector_store is None:
            # Check up front so a missing index always surfaces as
            # FileNotFoundError, whatever the loader itself would raise.
            if not os.path.exists(self.index_path):
                error_msg = f"FAISS index not found at: {self.index_path}. Please initialize the database first."
                logger.error(error_msg)
                raise FileNotFoundError(error_msg)

            try:
                logger.info(f"Loading FAISS index from: {self.index_path}")
                # NOTE(security): allow_dangerous_deserialization=True permits
                # pickle-based loading of the stored index. Only point
                # index_path at directories this application wrote itself.
                self._vector_store = FAISS.load_local(
                    self.index_path,
                    self.embeddings,
                    allow_dangerous_deserialization=True
                )
                logger.info("FAISS index loaded successfully")
            except FileNotFoundError as e:
                error_msg = f"FAISS index not found at: {self.index_path}. Please initialize the database first."
                logger.error(error_msg)
                raise FileNotFoundError(error_msg) from e
            except Exception as e:
                logger.exception("Error loading FAISS index")
                raise RuntimeError(f"Error loading FAISS index: {str(e)}") from e
        return self._vector_store

    def retrieve(
        self,
        claim: str,
        top_k: int = 3,
        similarity_threshold: float = 0.0
    ) -> List[Dict[str, Any]]:
        """
        Retrieve most similar verified facts for a given claim.

        Args:
            claim: The claim text to verify
            top_k: Number of similar facts to retrieve
            similarity_threshold: Minimum similarity score (0-1)

        Returns:
            List of dictionaries with 'fact', 'metadata', and 'similarity'
            (rounded to 3 decimals), in the order returned by the index.

        Raises:
            RuntimeError: If loading the index or searching fails; the
                original exception is attached as the cause.
        """
        try:
            logger.info(f"Retrieving top-{top_k} facts for claim: {claim[:100]}...")

            # Perform similarity search with scores
            docs_with_scores = self.vector_store.similarity_search_with_score(
                claim, k=top_k
            )
            logger.debug(f"Retrieved {len(docs_with_scores)} documents from FAISS")

            # Format and filter results
            similar_facts = []
            for doc, score in docs_with_scores:
                # FAISS returns distance, convert to similarity
                similarity = self._normalize_similarity(score)
                if similarity < similarity_threshold:
                    continue

                similar_facts.append({
                    'fact': doc.page_content,
                    'metadata': doc.metadata,
                    'similarity': round(similarity, 3)
                })
                logger.debug(f"Fact similarity: {similarity:.3f} - {doc.page_content[:50]}...")

            logger.info(f"Filtered to {len(similar_facts)} facts above threshold {similarity_threshold}")
            return similar_facts

        except Exception as e:
            logger.exception("Error retrieving similar facts")
            raise RuntimeError(f"Error retrieving similar facts: {str(e)}") from e

    @staticmethod
    def _normalize_similarity(distance: float) -> float:
        """
        Convert FAISS distance to similarity score (0-1 range).

        Args:
            distance: FAISS distance score (lower = more similar)

        Returns:
            Normalized similarity score: 1.0 at distance 0, approaching 0 as
            the distance grows.
        """
        # Clamp at 0 so a negative input (e.g. floating-point noise) cannot
        # push the score above 1 or divide by zero at distance == -1.
        # Assumes a non-negative distance metric - TODO confirm index type.
        return 1 / (1 + max(distance, 0.0))


class ClaimClassifier:
    """
    Uses LLM to classify claims as True/False/Unverifiable.
    Handles prompt engineering and response parsing.

    Any failure while calling the model or parsing its answer is turned into
    an "Unverifiable"/low-confidence fallback result rather than an exception.
    """

    LLM_MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
    TEMPERATURE = 0.3

    # Verdict constants
    VERDICT_TRUE = "Likely True"
    VERDICT_FALSE = "Likely False"
    VERDICT_UNVERIFIABLE = "Unverifiable"

    def __init__(self, api_key: str = None):
        """
        Initialize the ClaimClassifier.

        Args:
            api_key: Together AI API key
        """
        self.api_key = api_key or get_together_api_key()
        logger.info(f"Initializing ClaimClassifier with model: {self.LLM_MODEL}")

        try:
            self.llm = ChatTogether(
                model=self.LLM_MODEL,
                temperature=self.TEMPERATURE,
                api_key=self.api_key
            )
            logger.info(f"LLM initialized successfully (temperature={self.TEMPERATURE})")
        except Exception:
            logger.exception("Error initializing LLM")
            raise

    def classify(
        self,
        claim: str,
        retrieved_facts: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """
        Classify a claim against retrieved facts using LLM.

        Args:
            claim: The original claim to verify
            retrieved_facts: List of similar facts with metadata

        Returns:
            Dictionary with 'verdict', 'confidence', 'reasoning',
            'evidence_used' and 'evidence_details' (the retrieved facts).
            On failure the fallback result additionally carries 'error'.
        """
        logger.info(f"Classifying claim with {len(retrieved_facts)} retrieved facts")

        # Build prompt with evidence
        prompt = self._build_prompt(claim, retrieved_facts)
        logger.debug(f"Built prompt with {len(prompt)} characters")

        try:
            # Get LLM response
            logger.info("Invoking LLM for claim classification")
            response = self.llm.invoke([{"role": "user", "content": prompt}])
            response_text = response.content.strip()
            logger.debug(f"LLM response received ({len(response_text)} chars)")

            # Parse JSON response
            result = self._parse_response(response_text)
            logger.info(f"Classification result: {result['verdict']} (confidence: {result['confidence']})")

            # Add retrieved facts as evidence details
            result['evidence_details'] = retrieved_facts

            return result

        except json.JSONDecodeError as e:
            logger.error(f"JSON parsing failed: {str(e)}")
            return self._fallback_response(retrieved_facts, "JSON parsing failed")
        except Exception as e:
            logger.exception("Error during claim classification")
            return self._fallback_response(retrieved_facts, str(e))

    def _build_prompt(
        self,
        claim: str,
        retrieved_facts: List[Dict[str, Any]]
    ) -> str:
        """
        Build the classification prompt for the LLM.

        Args:
            claim: The claim to verify
            retrieved_facts: Retrieved evidence

        Returns:
            Formatted prompt string
        """
        # Format evidence
        evidence_text = self._format_evidence(retrieved_facts)

        # Construct prompt
        prompt = f"""You are a fact-checking assistant. Your task is to verify the following claim against verified evidence.

CLAIM TO VERIFY:
"{claim}"

VERIFIED EVIDENCE FROM DATABASE:
{evidence_text}

INSTRUCTIONS:
1. Compare the claim against the verified evidence carefully
2. Classify the claim as one of:
   - "{self.VERDICT_TRUE}" - if evidence strongly supports the claim
   - "{self.VERDICT_FALSE}" - if evidence contradicts the claim
   - "{self.VERDICT_UNVERIFIABLE}" - if insufficient or conflicting evidence

3. Provide your analysis in EXACTLY this JSON format (no additional text):
{{
  "verdict": "{self.VERDICT_TRUE}" | "{self.VERDICT_FALSE}" | "{self.VERDICT_UNVERIFIABLE}",
  "confidence": "high" | "medium" | "low",
  "reasoning": "Explain your decision in 2-3 sentences",
  "evidence_used": ["fact 1", "fact 2"]
}}

IMPORTANT:
- Be objective and base your verdict only on the evidence provided
- If the evidence is vague or irrelevant, mark as "{self.VERDICT_UNVERIFIABLE}"
- Consider dates, entities, and specific details when comparing
- Return ONLY the JSON object, no other text

YOUR RESPONSE:"""

        return prompt

    def _format_evidence(self, retrieved_facts: List[Dict[str, Any]]) -> str:
        """
        Format retrieved facts for the prompt.

        Args:
            retrieved_facts: List of facts with metadata; each item must
                provide 'fact', 'metadata' and 'similarity'

        Returns:
            Formatted evidence string (one numbered block per fact, blocks
            separated by a blank line), or a fixed notice when the list is
            empty
        """
        if not retrieved_facts:
            return "No similar verified facts found in the database."

        evidence_lines = []
        for i, fact in enumerate(retrieved_facts, 1):
            lines = [
                f"Evidence {i}:",
                f"{fact['fact']}",
                f"Source: {fact['metadata'].get('source', 'Unknown')}",
                f"Date: {fact['metadata'].get('date', 'Unknown')}",
                f"Similarity: {fact['similarity']:.2f}"
            ]
            evidence_lines.append("\n".join(lines))

        return "\n\n".join(evidence_lines)

    def _parse_response(self, response_text: str) -> Dict[str, Any]:
        """
        Parse LLM JSON response.

        The first JSON object in the text is decoded and anything before or
        after it is ignored, so commentary around the JSON (even commentary
        containing braces) does not break parsing. Missing fields are filled
        with "Unknown" ('evidence_used' with an empty list). Recognized
        verdict and confidence values are normalized to their canonical
        spelling; unrecognized values are left untouched.

        Args:
            response_text: Raw LLM response

        Returns:
            Parsed result dictionary

        Raises:
            json.JSONDecodeError: If no JSON object can be decoded.
        """
        start = response_text.find('{')
        if start == -1:
            raise json.JSONDecodeError(
                "No JSON object found in LLM response", response_text, 0
            )
        if start > 0:
            logger.debug("Extracted JSON from LLM response")

        # raw_decode stops at the end of the first complete JSON value and
        # tolerates trailing text, unlike json.loads.
        result, _ = json.JSONDecoder().raw_decode(response_text[start:])
        logger.debug("Successfully parsed JSON response")

        # Validate required fields
        required_fields = ['verdict', 'confidence', 'reasoning', 'evidence_used']
        missing_fields = [field for field in required_fields if field not in result]

        if missing_fields:
            logger.warning(f"Missing fields in LLM response: {missing_fields}")
            for field in missing_fields:
                result[field] = "Unknown" if field != 'evidence_used' else []

        # Canonicalize the verdict so "likely true" / " Likely True " compare
        # equal to the class constants.
        canonical_verdicts = {
            v.casefold(): v
            for v in (self.VERDICT_TRUE, self.VERDICT_FALSE, self.VERDICT_UNVERIFIABLE)
        }
        verdict = result['verdict']
        if isinstance(verdict, str):
            result['verdict'] = canonical_verdicts.get(verdict.strip().casefold(), verdict)

        confidence = result['confidence']
        if isinstance(confidence, str) and confidence.strip().lower() in ('high', 'medium', 'low'):
            result['confidence'] = confidence.strip().lower()

        # The prompt asks for a list; wrap a lone string so consumers do not
        # iterate over its characters.
        if isinstance(result['evidence_used'], str):
            result['evidence_used'] = [result['evidence_used']]

        return result

    def _fallback_response(
        self,
        retrieved_facts: List[Dict[str, Any]],
        error_msg: str
    ) -> Dict[str, Any]:
        """
        Create fallback response on error.

        Args:
            retrieved_facts: Retrieved evidence
            error_msg: Error message

        Returns:
            Fallback response dictionary (verdict "Unverifiable", low
            confidence) that carries the retrieved facts and the error text
        """
        logger.warning(f"Creating fallback response due to: {error_msg}")
        return {
            'verdict': self.VERDICT_UNVERIFIABLE,
            'confidence': 'low',
            'reasoning': f'Error during fact-checking: {error_msg}',
            'evidence_used': [],
            'evidence_details': retrieved_facts,
            'error': error_msg
        }


class FactChecker:
    """
    Main orchestrator for the fact-checking pipeline.
    Coordinates ClaimExtractor, FactRetriever, and ClaimClassifier.
    Follows Facade pattern to provide simple interface.

    `check_claim` never raises: every failure, including empty input, is
    reported as an "Unverifiable" result carrying an 'error' entry.
    """

    def __init__(self, api_key: str = None):
        """
        Initialize the FactChecker with all required components.

        Args:
            api_key: Together AI API key
        """
        logger.info("Initializing FactChecker pipeline")
        self.api_key = api_key or get_together_api_key()

        try:
            # Build the three pipeline stages; the retriever and classifier
            # share the same API key.
            logger.debug("Initializing ClaimExtractor")
            self.claim_extractor = ClaimExtractor()

            logger.debug("Initializing FactRetriever")
            self.fact_retriever = FactRetriever(api_key=self.api_key)

            logger.debug("Initializing ClaimClassifier")
            self.claim_classifier = ClaimClassifier(api_key=self.api_key)

            logger.info("FactChecker initialization complete")
        except Exception:
            logger.exception("Error initializing FactChecker")
            raise

    def check_claim(self, user_claim: str, top_k: int = 3) -> Dict[str, Any]:
        """
        Main fact-checking pipeline that orchestrates the entire process.

        Only the first extracted claim is fact-checked;
        'total_claims_extracted' reports how many were found.

        Args:
            user_claim: User's input claim/statement to verify
            top_k: Number of similar facts to retrieve

        Returns:
            Complete fact-check result with verdict, evidence, and reasoning,
            plus 'original_input', 'extracted_claim', 'entities_found' and
            'total_claims_extracted'. On any error the result additionally
            contains 'error'.
        """
        # Reject missing/blank input before logging a slice of it or spending
        # an embedding/LLM call on an empty claim.
        if not isinstance(user_claim, str) or not user_claim.strip():
            logger.warning("Empty or non-text claim received; skipping pipeline")
            return self._error_response(user_claim, "No claim text provided")

        logger.info("=" * 60)
        logger.info(f"Starting fact-check pipeline for claim: {user_claim[:100]}...")
        logger.info("=" * 60)

        try:
            # Step 1: Extract claims from input
            logger.info("Step 1: Extracting claims from input")
            claims = self.claim_extractor.extract_claims(user_claim)

            # For simplicity, fact-check the first/main claim
            main_claim = claims[0]['text'] if claims else user_claim
            logger.info(f"Main claim identified: {main_claim[:100]}...")

            # Step 2: Retrieve similar facts
            logger.info(f"Step 2: Retrieving top-{top_k} similar facts")
            similar_facts = self.fact_retriever.retrieve(main_claim, top_k=top_k)
            logger.info(f"Retrieved {len(similar_facts)} similar facts")

            # Step 3: Classify using LLM
            logger.info("Step 3: Classifying claim using LLM")
            result = self.claim_classifier.classify(main_claim, similar_facts)

            # Step 4: Add metadata
            logger.info("Step 4: Adding metadata to result")
            result['original_input'] = user_claim
            result['extracted_claim'] = main_claim
            result['entities_found'] = claims[0].get('entities', []) if claims else []
            result['total_claims_extracted'] = len(claims)

            logger.info(f"Fact-check complete: {result['verdict']}")
            logger.info("=" * 60)
            return result

        except Exception as e:
            logger.exception("Error in fact-checking pipeline")
            logger.info("=" * 60)
            return self._error_response(user_claim, str(e))

    def _error_response(self, user_claim: str, error_msg: str) -> Dict[str, Any]:
        """
        Create error response when pipeline fails.

        The dictionary has the same keys as a successful result (plus
        'error'), so callers can read any field without checking for
        failure first.

        Args:
            user_claim: Original user claim
            error_msg: Error message

        Returns:
            Error response dictionary
        """
        logger.error(f"Creating error response due to: {error_msg}")
        return {
            'verdict': 'Unverifiable',
            'confidence': 'low',
            'reasoning': f'Error during fact-checking pipeline: {error_msg}',
            'evidence_used': [],
            'evidence_details': [],
            'original_input': user_claim,
            'extracted_claim': user_claim,
            'entities_found': [],
            'total_claims_extracted': 0,
            'error': error_msg
        }


# ========================================================================
# LEGACY FUNCTION WRAPPERS (for backward compatibility)
# ========================================================================

def load_verified_facts(csv_path: str = "verified_facts_db.csv") -> str:
    """
    Backward-compatible entry point for building the facts index.

    Creates a fresh FactsDatabase and delegates to its load_from_csv method.

    Args:
        csv_path: Location of the verified facts CSV file

    Returns:
        The status message produced by FactsDatabase.load_from_csv
    """
    return FactsDatabase().load_from_csv(csv_path)


def retrieve_similar_facts(
    claim: str,
    top_k: int = 3,
    similarity_threshold: float = 0.0
) -> List[Dict[str, Any]]:
    """
    Backward-compatible entry point for fact retrieval.

    Creates a fresh FactRetriever and delegates to its retrieve method.

    Args:
        claim: Claim text to look up
        top_k: How many similar facts to request
        similarity_threshold: Lowest similarity score (0-1) to keep

    Returns:
        List of dictionaries with 'fact', 'metadata', and 'similarity'
    """
    return FactRetriever().retrieve(
        claim,
        top_k=top_k,
        similarity_threshold=similarity_threshold,
    )


def classify_claim(claim: str, retrieved_facts: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Backward-compatible entry point for claim classification.

    Creates a fresh ClaimClassifier and delegates to its classify method.

    Args:
        claim: Claim to be verified
        retrieved_facts: Similar facts (with metadata) to use as evidence

    Returns:
        Dictionary with 'verdict', 'confidence', 'reasoning', 'evidence_used'
    """
    return ClaimClassifier().classify(claim, retrieved_facts)


def fact_check_claim(user_claim: str, top_k: int = 3) -> Dict[str, Any]:
    """
    Backward-compatible entry point for the full fact-checking pipeline.

    Creates a fresh FactChecker and delegates to its check_claim method.

    Args:
        user_claim: Claim or statement supplied by the user
        top_k: How many similar facts to retrieve as evidence

    Returns:
        Complete fact-check result with verdict, evidence, and reasoning
    """
    return FactChecker().check_claim(user_claim, top_k=top_k)