"""
MEXAR - Faithfulness Scoring Module
Measures how well the LLM answer is grounded in the retrieved context.
"""
import json
import logging
from typing import List, Dict
from dataclasses import dataclass

logger = logging.getLogger(__name__)


@dataclass
class FaithfulnessResult:
    """Result of faithfulness evaluation."""
    score: float  # 0-1 score
    total_claims: int
    supported_claims: int
    unsupported_claims: List[str]


class FaithfulnessScorer:
    """
    Evaluates how faithful (grounded) an LLM answer is to the context.
    
    Process:
    1. Extract factual claims from the answer
    2. Check each claim against the retrieved context
    3. Calculate percentage of supported claims
    
    High faithfulness = answer is well-grounded, low hallucination risk
    """
    
    def __init__(self, groq_client=None):
        """
        Initialize scorer.
        
        Args:
            groq_client: Groq client for LLM calls
        """
        self._client = groq_client
    
    @property
    def client(self):
        """Lazy load Groq client."""
        if self._client is None:
            from utils.groq_client import get_groq_client
            self._client = get_groq_client()
        return self._client
    
    def score(self, answer: str, context: str) -> FaithfulnessResult:
        """
        Score how well answer is grounded in context.
        
        Args:
            answer: LLM generated answer
            context: Retrieved context used to generate answer
            
        Returns:
            FaithfulnessResult with score and details
        """
        if not answer or not context:
            return FaithfulnessResult(
                score=1.0, 
                total_claims=0, 
                supported_claims=0, 
                unsupported_claims=[]
            )
        
        # Step 1: Extract claims from answer
        claims = self._extract_claims(answer)
        
        if not claims:
            return FaithfulnessResult(
                score=1.0,
                total_claims=0,
                supported_claims=0,
                unsupported_claims=[]
            )
        
        # Step 2: Check each claim against context
        supported = 0
        unsupported = []
        
        for claim in claims:
            if self._is_supported(claim, context):
                supported += 1
            else:
                unsupported.append(claim)
        
        # Step 3: Calculate score
        score = supported / len(claims)
        
        logger.info(f"Faithfulness: {supported}/{len(claims)} claims supported ({score*100:.0f}%)")
        
        return FaithfulnessResult(
            score=round(score, 3),
            total_claims=len(claims),
            supported_claims=supported,
            unsupported_claims=unsupported[:5]  # Limit to 5 for display
        )
    
    def _extract_claims(self, answer: str) -> List[str]:
        """
        Extract factual claims from the answer.
        
        Uses LLM to identify distinct factual statements.
        """
        try:
            prompt = f"""Extract individual factual claims from this answer. 
A claim is a specific statement that can be verified as true or false.
Return ONLY a JSON array of strings, no explanation.

Answer: "{answer[:2000]}"

Example output: ["Claim 1", "Claim 2", "Claim 3"]"""

            response = self.client.analyze_with_system_prompt(
                system_prompt="You extract factual claims. Return only valid JSON array.",
                user_message=prompt,
                model="fast",
                json_mode=True
            )
            
            # Parse response
            claims = json.loads(response)
            
            # Handle both list and dict responses
            if isinstance(claims, list):
                return [str(c) for c in claims if c]
            elif isinstance(claims, dict):
                return [str(c) for c in claims.get("claims", claims.get("statements", [])) if c]
            
            return []
            
        except json.JSONDecodeError as e:
            logger.warning(f"Failed to parse claims JSON: {e}")
            # Fallback: split by sentences
            return self._fallback_extract_claims(answer)
        except Exception as e:
            logger.warning(f"Claim extraction failed: {e}")
            return self._fallback_extract_claims(answer)
    
    def _fallback_extract_claims(self, answer: str) -> List[str]:
        """Fallback claim extraction by splitting sentences."""
        import re
        sentences = re.split(r'(?<=[.!?])\s+', answer)
        # Filter to substantive sentences
        return [s.strip() for s in sentences if len(s.strip()) > 20][:10]
    
    def _is_supported(self, claim: str, context: str) -> bool:
        """
        Check if a claim is supported by the context.
        
        Uses LLM to evaluate if the context contains evidence for the claim.
        """
        try:
            prompt = f"""Is this claim supported by the context? Answer only YES or NO.

Claim: "{claim}"

Context (first 4000 chars):
"{context[:4000]}"

Answer YES if the context contains information that supports this claim.
Answer NO if the claim cannot be verified from the context or contradicts it."""

            response = self.client.analyze_with_system_prompt(
                system_prompt="You verify claims. Answer only YES or NO.",
                user_message=prompt,
                model="fast"
            )
            
            return "YES" in response.upper()
            
        except Exception as e:
            logger.warning(f"Support check failed: {e}")
            # Optimistic fallback - assume supported if check fails
            return True
    
    def quick_score(self, answer: str, context: str) -> float:
        """
        Quick faithfulness estimate without LLM calls.
        Uses text overlap as a proxy for grounding.
        
        Args:
            answer: LLM answer
            context: Retrieved context
            
        Returns:
            Estimated faithfulness score (0-1)
        """
        if not answer or not context:
            return 0.5
        
        # Get significant words from answer
        answer_words = set(w.lower() for w in answer.split() if len(w) > 4)
        context_lower = context.lower()
        
        if not answer_words:
            return 0.5
        
        # Check how many answer words appear in context
        found = sum(1 for w in answer_words if w in context_lower)
        overlap = found / len(answer_words)
        
        # Scale to reasonable range
        return min(1.0, overlap * 1.5)


def create_faithfulness_scorer() -> FaithfulnessScorer:
    """Factory function to create FaithfulnessScorer."""
    return FaithfulnessScorer()
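
# Usage sketch (illustrative only, not executed on import). It assumes a Groq
# client is configured and reachable via utils.groq_client.get_groq_client();
# the answer/context strings are hypothetical examples:
#
#   scorer = create_faithfulness_scorer()
#   result = scorer.score(
#       answer="Paris is the capital of France.",
#       context="France's capital city is Paris, on the river Seine.",
#   )
#   print(result.score, f"{result.supported_claims}/{result.total_claims}")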


class BartNLIScorer:
    """
    Evaluates faithfulness using a local NLI model (BART-Large-MNLI) 
    to break the circular evaluation where the generator evaluates itself.
    """
    def __init__(self):
        self._pipe = None
        
    @property
    def pipe(self):
        if self._pipe is None:
            try:
                from transformers import pipeline
                logger.info("Loading BART NLI model...")
                # 'contradiction' (0), 'neutral' (1), 'entailment' (2)
                self._pipe = pipeline("text-classification", model="facebook/bart-large-mnli")
                logger.info("BART NLI loaded.")
            except ImportError:
                logger.error("transformers not installed. Cannot use BartNLIScorer.")
                self._pipe = "UNAVAILABLE"
        return self._pipe

    def score(self, answer: str, context: str) -> FaithfulnessResult:
        if not answer or not context or self.pipe == "UNAVAILABLE":
            return FaithfulnessResult(score=1.0, total_claims=0, supported_claims=0, unsupported_claims=[])
            
        import re
        sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', answer) if len(s.strip()) > 20][:10]
        if not sentences:
            return FaithfulnessResult(score=1.0, total_claims=0, supported_claims=0, unsupported_claims=[])
            
        supported = 0
        unsupported = []
        
        try:
            for sentence in sentences:
                # Format for bart-large-mnli: premise </s></s> hypothesis
                input_text = f"{context[:3000]} </s></s> {sentence}"
                result = self.pipe(input_text, truncation=True, max_length=1024)[0]
                label = result['label'].lower()
                # Only strict entailment counts as supported; neutral and contradiction are treated as unsupported
                if 'entail' in label:
                    supported += 1
                else:
                    unsupported.append(sentence)
        except Exception as e:
            logger.error(f"BART NLI Error: {e}")
            return FaithfulnessResult(score=0.5, total_claims=len(sentences), supported_claims=0, unsupported_claims=sentences[:5])

        score = supported / len(sentences)
        logger.info(f"BART NLI Faithfulness: {supported}/{len(sentences)} claims supported ({score*100:.0f}%)")
        return FaithfulnessResult(
            score=round(score, 3),
            total_claims=len(sentences),
            supported_claims=supported,
            unsupported_claims=unsupported[:5]
        )


class FActScoreCompat:
    """
    Approximates the FActScore (Min et al., EMNLP 2023) evaluation protocol:
    break the answer into atomic facts and verify each fact against the context independently.
    Acts as a wrapper around FaithfulnessScorer so runs can be explicitly marked
    for FActScore baseline comparisons.
    """
    def __init__(self, groq_client=None):
        self._scorer = FaithfulnessScorer(groq_client=groq_client)
        
    def score(self, answer: str, context: str) -> FaithfulnessResult:
        result = self._scorer.score(answer, context)
        logger.info(f"FActScore: {result.score * 100:.1f}% ({result.supported_claims}/{result.total_claims} facts)")
        return result
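

if __name__ == "__main__":
    # Minimal self-check sketch: exercises only quick_score(), which needs no
    # LLM access or model download. The strings below are illustrative examples.
    logging.basicConfig(level=logging.INFO)
    _scorer = FaithfulnessScorer()
    _context = "The Eiffel Tower is located in Paris and was completed in 1889."
    _grounded = "The Eiffel Tower stands in Paris and was completed in 1889."
    _ungrounded = "The Eiffel Tower is located in Berlin and opened in 1920."
    print("grounded quick_score  :", _scorer.quick_score(_grounded, _context))
    print("ungrounded quick_score:", _scorer.quick_score(_ungrounded, _context))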