Spaces:

devrajsinh2012
/

Mexar

Sleeping

File size: 7,781 Bytes

b0b150b

"""
MEXAR - Source Attribution Module
Links each sentence in the answer to its supporting source chunk.
Provides inline citations for full transparency.
"""
import re
import logging
from typing import List, Dict, Tuple, Any
from dataclasses import dataclass, field
import numpy as np

logger = logging.getLogger(__name__)


@dataclass
class AttributedSentence:
    """One answer sentence paired with the chunk chosen as its source.

    Attributes:
        text: The sentence text taken from the answer.
        citation: Inline citation marker for the source, e.g. ``"[1]"``.
        source_chunk_id: Identifier of the chunk the sentence is attributed to.
        source_preview: Short excerpt of the attributed chunk's content.
        source_file: Source label of the attributed chunk.
        similarity: Similarity score between the sentence and the chunk.
    """
    text: str
    citation: str
    source_chunk_id: int
    source_preview: str
    source_file: str
    similarity: float


@dataclass
class AttributedAnswer:
    """Result of attributing a whole answer to its source chunks.

    Attributes:
        answer_with_citations: The answer text with inline citation markers.
        sentences: Per-sentence attribution records.
        sources: One dict per cited source, intended for display.
    """
    answer_with_citations: str
    sentences: List[AttributedSentence]
    sources: List[Dict]


class SourceAttributor:
    """
    Attributes each sentence in an LLM answer to its source chunk.

    This enables:
    1. Inline citations [1], [2], etc.
    2. Verification of claims against source data
    3. Transparency about where information came from

    A chunk may be an object exposing ``content`` / ``id`` / ``source``
    attributes, a dict with those keys, or any other value (which is
    stringified for its content and gets id 0 and source "unknown").
    """

    # Class-level default so the attribute exists even if a subclass
    # overrides __init__ without calling super().
    min_sentence_words: int = 4

    # Maximum number of characters of chunk content kept as a preview.
    PREVIEW_CHARS = 150

    # A sentence boundary is whitespace preceded by '.', '!' or '?'.
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')

    def __init__(self, embedding_model=None, min_sentence_words: int = 4):
        """
        Initialize attributor.

        Args:
            embedding_model: FastEmbed model for sentence embedding. It must
                provide ``embed(list_of_str)`` returning an iterable of
                vectors. When None, every sentence is attributed to the
                first chunk with a fixed similarity of 0.5.
            min_sentence_words: Sentences with fewer whitespace-separated
                words than this are skipped and receive no citation.
        """
        self.embedding_model = embedding_model
        self.min_sentence_words = min_sentence_words

    def attribute(
        self,
        answer: str,
        chunks: List[Any],
        chunk_embeddings: List[np.ndarray] = None
    ) -> "AttributedAnswer":
        """
        Attribute each sentence in answer to source chunks.

        Args:
            answer: LLM generated answer
            chunks: Retrieved DocumentChunk objects
            chunk_embeddings: Pre-computed embeddings, one per chunk and in
                the same order as ``chunks`` (optional). When omitted and an
                embedding model is configured, they are computed here.

        Returns:
            AttributedAnswer with citations. Citation numbers are assigned
            in order of first use; each entry in ``sources`` carries the
            similarity of the first sentence that cited that chunk. If
            ``answer`` or ``chunks`` is empty, the answer is returned
            unchanged with no sentences and no sources.
        """
        if not answer or not chunks:
            return AttributedAnswer(
                answer_with_citations=answer,
                sentences=[],
                sources=[]
            )

        # Compute chunk embeddings if not provided
        if chunk_embeddings is None and self.embedding_model:
            contents = [self._get_content(c) for c in chunks]
            chunk_embeddings = list(self.embedding_model.embed(contents))

        sources_used = {}  # chunk_id -> citation_number
        attributed_sentences = []
        sources = []

        for sentence in self._split_sentences(answer):
            # Skip very short or non-substantive sentences
            if len(sentence.split()) < self.min_sentence_words:
                continue

            best_chunk, similarity = self._find_best_source(
                sentence, chunks, chunk_embeddings
            )

            chunk_id = self._get_id(best_chunk)
            preview = self._get_content(best_chunk)[:self.PREVIEW_CHARS]
            source_file = self._get_source(best_chunk)

            citation_num = sources_used.get(chunk_id)
            if citation_num is None:
                # First time this chunk is cited: allocate the next number
                # and record its display entry right away, so the sources
                # list comes out in citation order without a second pass.
                citation_num = len(sources_used) + 1
                sources_used[chunk_id] = citation_num
                sources.append({
                    "citation": f"[{citation_num}]",
                    "chunk_id": chunk_id,
                    "source": source_file,
                    "preview": preview,
                    "similarity": round(similarity, 3)
                })

            attributed_sentences.append(AttributedSentence(
                text=sentence,
                citation=f"[{citation_num}]",
                source_chunk_id=chunk_id,
                source_preview=preview,
                source_file=source_file,
                similarity=similarity
            ))

        answer_with_citations = self._build_cited_answer(answer, attributed_sentences)

        return AttributedAnswer(
            answer_with_citations=answer_with_citations,
            sentences=attributed_sentences,
            sources=sources
        )

    def _split_sentences(self, text: str) -> List[str]:
        """Split text into stripped, non-empty sentences.

        Splitting happens on whitespace that follows '.', '!' or '?'; the
        punctuation stays attached to the sentence it ends.
        """
        pieces = self._SENTENCE_BOUNDARY.split(text)
        return [p.strip() for p in pieces if p.strip()]

    def _find_best_source(
        self,
        sentence: str,
        chunks: List[Any],
        chunk_embeddings: List[np.ndarray]
    ) -> Tuple[Any, float]:
        """Find the chunk most similar to the sentence.

        Args:
            sentence: Sentence to attribute.
            chunks: Candidate chunks.
            chunk_embeddings: Embeddings aligned with ``chunks``; may be
                None, an empty sequence, a list of vectors or a 2-D array.

        Returns:
            ``(chunk, similarity)``. Returns ``(None, 0.0)`` when there are
            no chunks, and ``(chunks[0], 0.5)`` when no embedding model or
            embeddings are available or when embedding the sentence fails.
            Otherwise the chunk with the highest cosine similarity; if no
            similarity is above 0.0 the first chunk is returned with 0.0.
        """
        if not chunks:
            return None, 0.0

        # Explicit None/length check: truth-testing a numpy array raises
        # "truth value of an array is ambiguous".
        no_embeddings = chunk_embeddings is None or len(chunk_embeddings) == 0

        # Default to first chunk if no embeddings
        if not self.embedding_model or no_embeddings:
            return chunks[0], 0.5

        try:
            sentence_emb = list(self.embedding_model.embed([sentence]))[0]

            best_chunk = chunks[0]
            best_sim = 0.0

            for chunk, emb in zip(chunks, chunk_embeddings):
                sim = self._cosine_similarity(sentence_emb, emb)
                if sim > best_sim:
                    best_sim = sim
                    best_chunk = chunk

            return best_chunk, best_sim

        except Exception as e:
            # Best effort: attribution must not fail because the embedding
            # backend did.
            logger.warning("Embedding failed in attribution: %s", e)
            return chunks[0], 0.5

    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
        """Calculate cosine similarity between two vectors.

        Returns 0.0 when either vector has zero norm or when the inputs
        cannot be combined (incompatible shapes or non-numeric values).
        """
        try:
            dot = np.dot(a, b)
            norm_a = np.linalg.norm(a)
            norm_b = np.linalg.norm(b)
            if norm_a == 0 or norm_b == 0:
                return 0.0
            return float(dot / (norm_a * norm_b))
        except (TypeError, ValueError):
            return 0.0

    def _build_cited_answer(
        self,
        answer: str,
        attributed: "List[AttributedSentence]"
    ) -> str:
        """Insert citations after sentences in the answer.

        Args:
            answer: Original answer text.
            attributed: Attributed sentences in the order they occur in
                ``answer``; only ``text`` and ``citation`` are read.

        Returns:
            The answer with `` <citation>`` appended after each attributed
            sentence. Sentences not found in the remaining text are skipped.
        """
        pieces = []
        cursor = 0  # everything before this index has already been emitted

        for attr in attributed:
            # Search only past the previous match so that a repeated
            # sentence (or one contained in an earlier sentence) gets its
            # citation at its own position, not at the first occurrence.
            start = answer.find(attr.text, cursor)
            if start == -1:
                continue
            end = start + len(attr.text)
            pieces.append(answer[cursor:end])
            pieces.append(f" {attr.citation}")
            cursor = end

        pieces.append(answer[cursor:])
        return "".join(pieces)

    def _get_content(self, chunk) -> str:
        """Extract content from chunk object.

        Missing or None content yields an empty string so callers can slice
        the result safely.
        """
        if hasattr(chunk, 'content'):
            return chunk.content or ''
        if isinstance(chunk, dict):
            return chunk.get('content') or ''
        return str(chunk)

    def _get_id(self, chunk) -> int:
        """Extract ID from chunk object; 0 when the chunk carries none."""
        if hasattr(chunk, 'id'):
            return chunk.id
        if isinstance(chunk, dict):
            return chunk.get('id', 0)
        return 0

    def _get_source(self, chunk) -> str:
        """Extract source from chunk object.

        Returns "unknown" when the source is missing or empty, for both
        attribute-style and dict-style chunks.
        """
        if hasattr(chunk, 'source'):
            return chunk.source or "unknown"
        if isinstance(chunk, dict):
            return chunk.get('source') or "unknown"
        return "unknown"


def create_source_attributor(embedding_model=None) -> SourceAttributor:
    """Build a SourceAttributor around the given embedding model.

    Args:
        embedding_model: Passed unchanged to ``SourceAttributor``; may be None.

    Returns:
        A new SourceAttributor instance.
    """
    attributor = SourceAttributor(embedding_model)
    return attributor